SSE2 single-precision kernels
authorErik Lindahl <erik@kth.se>
Tue, 13 Nov 2012 22:21:13 +0000 (23:21 +0100)
committerMark Abraham <mark.j.abraham@gmail.com>
Fri, 16 Nov 2012 15:37:44 +0000 (16:37 +0100)
New single-precision x86 kernels rewritten to use intrinsics
instead of raw assembly. SSE2 is now our lowest-supported
accelerated instruction set on x86. We generate kernels for
the same interaction choices as the plain c kernels, but
have not enabled Buckingham interactions SSE kernels since
that would generate a whole lot more kernels that would only
be use in rare circumstances. The neighborsearching routine
has also been updated to optionally generate lists that
are padded up to the simd width, which is set by each
kernel when we initialize the neighborlist.

Change-Id: Ic51078e10e7f1f29af3d33e256177806ec208646

122 files changed:
CMakeLists.txt
include/ns.h
include/types/nblist.h
src/config.h.cmakein
src/gmxlib/nonbonded/CMakeLists.txt
src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/make_nb_kernel_sse2_single.py [new file with mode: 0755]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_sse2_single.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_sse2_single.h [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre [new file with mode: 0644]
src/gmxlib/nonbonded/nonbonded.c
src/kernel/md.c
src/mdlib/force.c
src/mdlib/ns.c
src/tools/addconf.c

index 04ed33cd29b7672782144fc8be88f9fdd82223c5..24b090f2a803715c09f949e8f16b758cceea344b 100644 (file)
@@ -167,16 +167,16 @@ option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
 mark_as_advanced(GMX_FAHCORE)
 
 include(gmxDetectAcceleration)
-if(NOT DEFINED GMX_ACCELERATION)
+if(NOT DEFINED GMX_CPU_ACCELERATION)
     if(CMAKE_CROSSCOMPILING)
-        set(GMX_SUGGESTED_ACCELERATION "None")
+        set(GMX_SUGGESTED_CPU_ACCELERATION "None")
     else(CMAKE_CROSSCOMPILING)
-        gmx_detect_acceleration(GMX_SUGGESTED_ACCELERATION)
+        gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
     endif(CMAKE_CROSSCOMPILING)
-endif(NOT DEFINED GMX_ACCELERATION)
+endif(NOT DEFINED GMX_CPU_ACCELERATION)
 
-set(GMX_ACCELERATION "@GMX_SUGGESTED_ACCELERATION@"
-    CACHE STRING "Accelerated kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene, Power6, Fortran")
+set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@"
+    CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, BlueGene, Power6, Fortran")
 
 set(GMX_FFT_LIBRARY "fftw3" 
     CACHE STRING "FFT library choices: fftw3,mkl,fftpack[built-in]")
@@ -343,11 +343,11 @@ if(GMX_OPENMM)
         set(GMX_SOFTWARE_INVSQRT OFF CACHE STRING 
                 "The OpenMM build does not need GROMACS software 1/sqrt!" FORCE)
     endif(GMX_SOFTWARE_INVSQRT)
-    string(TOUPPER ${GMX_ACCELERATION} GMX_ACCELERATION)
-    if(NOT GMX_ACCELERATION STREQUAL "NONE")
-        message(STATUS "Switching off CPU-based acceleration, the OpenMM build does not support/need any!")    
-        set(GMX_ACCELERATION "none" CACHE STRING 
-               "Switching off CPU-based acceleration, the OpenMM build does not support/need any!" FORCE)
+    string(TOUPPER ${GMX_CPU_ACCELERATION} GMX_CPU_ACCELERATION)
+    if(NOT GMX_CPU_ACCELERATION STREQUAL "NONE")
+        message(STATUS "Switching off CPU-based acceleration, the OpenMM build does not support/need any!")
+        set(GMX_CPU_ACCELERATION "None" CACHE STRING
+            "Switching off CPU-based acceleration, the OpenMM build does not support/need any!" FORCE)
     endif()
     if(GMX_FAHCORE)
         message(FATAL_ERROR "The OpenMM build does not support FAH build!")
@@ -356,10 +356,10 @@ if(GMX_OPENMM)
         message(FATAL_ERROR  "The OpenMM-build does not support double precision calculations!")
     endif()
     # mark as advanced the unused variables
-    mark_as_advanced(FORCE GMX_ACCELERATION GMX_MPI GMX_FFT_LIBRARY 
+    mark_as_advanced(FORCE GMX_CPU_ACCELERATION GMX_MPI GMX_FFT_LIBRARY 
         GMX_QMMM_PROGRAM GMX_THREAD_MPI GMX_DOUBLE)
 else(GMX_OPENMM)
-     mark_as_advanced(CLEAR GMX_ACCELERATION GMX_MPI GMX_FFT_LIBRARY 
+     mark_as_advanced(CLEAR GMX_CPU_ACCELERATION GMX_MPI GMX_FFT_LIBRARY 
         GMX_QMMM_PROGRAM GMX_THREAD_MPI GMX_DOUBLE)
 endif(GMX_OPENMM)
 
@@ -695,10 +695,10 @@ if(NOT GMX_SYSTEM_XDR)
 endif(NOT GMX_SYSTEM_XDR)
 
 # Process nonbonded accelerated kernels settings
-string(TOUPPER ${GMX_ACCELERATION} ${GMX_ACCELERATION})
-if(${GMX_ACCELERATION} STREQUAL "NONE")
+string(TOUPPER ${GMX_CPU_ACCELERATION} ${GMX_CPU_ACCELERATION})
+if(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
     # nothing to do
-elseif(${GMX_ACCELERATION} STREQUAL "SSE2")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2")
 
     GMX_TEST_CFLAG(GNU_SSE2_CFLAG "-msse2" GROMACS_C_FLAGS)
     if(NOT GNU_SSE2_CFLAG)
@@ -721,13 +721,14 @@ elseif(${GMX_ACCELERATION} STREQUAL "SSE2")
         message(FATAL_ERROR "Cannot find emmintrin.h, which is required for SSE2 intrinsics support.")
     endif(NOT HAVE_EMMINTRIN_H)
 
+    set(GMX_CPU_ACCELERATION_X86_SSE2 1)
     # The user should not be able to set this orthogonally to the acceleration
     set(GMX_X86_SSE2 1)
     if (NOT ACCELERATION_QUIETLY)
       message(STATUS "Enabling SSE2 Gromacs acceleration, and it will help compiler optimization.")
     endif()
 
-elseif(${GMX_ACCELERATION} STREQUAL "SSE4.1")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1")
 
     GMX_TEST_CFLAG(GNU_SSE4_CFLAG "-msse4.1" GROMACS_C_FLAGS)
     if (NOT GNU_SSE4_CFLAG)
@@ -760,6 +761,7 @@ elseif(${GMX_ACCELERATION} STREQUAL "SSE4.1")
         message(FATAL_ERROR "Cannot find smmintrin.h, which is required for SSE4.1 intrinsics support.")
     endif(NOT HAVE_SMMINTRIN_H)
 
+    set(GMX_CPU_ACCELERATION_X86_SSE4_1 1)
     # The user should not be able to set this orthogonally to the acceleration
     set(GMX_X86_SSE4_1 1)
     set(GMX_X86_SSE2   1)
@@ -767,7 +769,7 @@ elseif(${GMX_ACCELERATION} STREQUAL "SSE4.1")
       message(STATUS "Enabling SSE4.1 Gromacs acceleration, and it will help compiler optimization.")
     endif()
 
-elseif(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_ACCELERATION} STREQUAL "AVX_256")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION} STREQUAL "AVX_256")
 
     # Set the AVX compiler flag for both these choices!
 
@@ -790,7 +792,7 @@ elseif(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_ACCELERATION} STREQUA
     endif()
 
     # Set the FMA4 flags (MSVC doesn't require any)
-    if(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
+    if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
         GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" GROMACS_C_FLAGS)
         if (NOT GNU_FMA_CFLAG)
             message(WARNING "No C FMA4 flag found. Consider a newer compiler, or disable AVX_128_FMA for much lower performance.")
@@ -828,27 +830,29 @@ elseif(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_ACCELERATION} STREQUA
     set(GMX_X86_SSE2   1)
 
     # But just enable one of the choices internally...
-    if(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA")
+    if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA")
+        set(GMX_CPU_ACCELERATION_X86_AVX_128_FMA 1)
         set(GMX_X86_AVX_128_FMA 1)
         if (NOT ACCELERATION_QUIETLY)
           message(STATUS "Enabling 128-bit AVX Gromacs acceleration (with fused-multiply add), and it will help compiler optimization.")
         endif()
     else()
         # If we are not doing AVX_128, it must be AVX_256...
+        set(GMX_CPU_ACCELERATION_X86_AVX_256 1)
         set(GMX_X86_AVX_256 1)
         if (NOT ACCELERATION_QUIETLY)
           message(STATUS "Enabling 256-bit AVX Gromacs acceleration, and it will help compiler optimization.")
         endif()
     endif()
 
-elseif(${GMX_ACCELERATION} STREQUAL "FORTRAN")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "FORTRAN")
 
 #    Fortran is temporarily disabled while we push in nbNxN kernels.
 #    We need to fake it a bit here to avoid jenkins build errors!
 #    add_definitions(-DGMX_FORTRAN)
 
-elseif(${GMX_ACCELERATION} STREQUAL "BLUEGENE")
-# GMX_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
+# GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
     if (NOT ACCELERATION_QUIETLY)
       message(STATUS "Configuring for BlueGene")
     endif()
@@ -869,13 +873,13 @@ elseif(${GMX_ACCELERATION} STREQUAL "BLUEGENE")
 # The automatic testing for endianness does not work for the BlueGene cross-compiler
     set(GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP byte order (by default)" FORCE)
     set(GMX_IEEE754_BIG_ENDIAN_WORD_ORDER 1 CACHE INTERNAL "BlueGene has big endian FP word order (by default)" FORCE)
-elseif(${GMX_ACCELERATION} STREQUAL "POWER6")
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "POWER6")
     set(GMX_POWER6 1)
     set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
     set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
-else(${GMX_ACCELERATION} STREQUAL "NONE")
-    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, Fortran, BlueGene, Power6")
-endif(${GMX_ACCELERATION} STREQUAL "NONE")
+else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
+    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, Fortran, BlueGene, Power6")
+endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
 set(ACCELERATION_QUIETLY TRUE CACHE INTERNAL "")
 
 if(GMX_FORTRAN OR GMX_POWER6)
@@ -952,9 +956,9 @@ if(${GMX_FFT_LIBRARY} STREQUAL "FFTW3")
 
     set(GMX_FFT_FFTW3 1)
 
-    if (NOT ${GMX_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD) 
+    if (NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD) 
       message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin")
-    endif (NOT ${GMX_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD) 
+    endif (NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD) 
 
 elseif(${GMX_FFT_LIBRARY} STREQUAL "MKL")
 #    MESSAGE(STATUS "Using external FFT library - Intel MKL")
index d59af03482f66d3883adfa2a8f2f814ebbe54021..1b40338e3ea940f37432100aa1bdd4e79b6321a2 100644 (file)
@@ -89,7 +89,8 @@ int search_neighbours(FILE *log,t_forcerec *fr,
                             real *lambda,real *dvdlambda,
                             gmx_grppairener_t *grppener,
                             gmx_bool bFillGrid,
-                            gmx_bool bDoLongRangeNS);
+                            gmx_bool bDoLongRangeNS,
+                 gmx_bool bPadListsForKernels);
  
 
 /* Debugging routines from wnblist.c */
index 1e189cd7ef9a3eb5736426ad192eb507694d3b12..c4786511fee8ca70f767e4c11e5b0f208999cece 100644 (file)
@@ -79,6 +79,14 @@ typedef struct
     void *          kernelptr_vf;
     void *          kernelptr_v;
     void *          kernelptr_f;
+
+    /* Pad the list of neighbors for each i atom with "-1" entries up to the
+     * simd_padding_width, if it is larger than 0. This is necessary for many
+     * accelerated kernels using single-instruction multiple-data operations
+     * internally.
+     */
+    int             simd_padding_width;
+
 } t_nblist;
 
 
index a5024ef659bdb8339049a0da9f279302b23f9e41..14cbbce89e545bcf1e4baf080ef2f28d326d6881 100644 (file)
 /* Use AMD core math library */
 #cmakedefine GMX_FFT_ACML
 
-/* What type of acceleration is used? (string, for dumping to files) */
-#define GMX_ACCELERATION "@GMX_ACCELERATION@"
-
-/* SSE2 acceleration */
+/* SSE2 instructions available */
 #cmakedefine GMX_X86_SSE2
 
-/* SSE4.1 acceleration */
+/* SSE4.1 instructions available */
 #cmakedefine GMX_X86_SSE4_1
 
-/* AVX 128-bit acceleration with FMA, useful on modern AMD hardware */
+/* AVX 128-bit FMA instructions available */
 #cmakedefine GMX_X86_AVX_128_FMA
 
-/* AVX 256-bit acceleration, usually for intel hardware */
+/* AVX 256-bit instructions available */
 #cmakedefine GMX_X86_AVX_256
 
+/* SSE2 was selected as CPU acceleration level */
+#cmakedefine GMX_CPU_ACCELERATION_X86_SSE2
+
+/* SSE4.1 was selected as CPU acceleration level */
+#cmakedefine GMX_CPU_ACCELERATION_X86_SSE4_1
+
+/* AVX 128-bit FMA was selected as CPU acceleration level */
+#cmakedefine GMX_CPU_ACCELERATION_X86_AVX_128_FMA
+
+/* AVX 256-bit was selected as CPU acceleration level */
+#cmakedefine GMX_CPU_ACCELERATION_X86_AVX_256
+
+/* String for CPU acceleration choice (for writing to log files and stdout) */
+#define GMX_CPU_ACCELERATION_STRING "@GMX_CPU_ACCELERATION@"
+
 /* Integer byte order is big endian. */
 #cmakedefine GMX_INTEGER_BIG_ENDIAN 
 
index 74bb4f80e36f22d4cbb315c359a824569bafb2ed..350d054ec3610a31a0ae4be87c8b8959d912ebb5 100644 (file)
@@ -1,8 +1,16 @@
 # Sources that should always be built
 file(GLOB NONBONDED_SOURCES *.c nb_kernel_c/*.c)
 
+# For now we enable the (existing) SSE2 kernels for all accelerations
+if(((GMX_CPU_ACCELERATION STREQUAL "SSE2") OR
+    (GMX_CPU_ACCELERATION STREQUAL "SSE4.1") OR
+    (GMX_CPU_ACCELERATION STREQUAL "AVX_128_FMA") OR
+    (GMX_CPU_ACCELERATION STREQUAL "AVX_256")) AND NOT GMX_DOUBLE)
+    file(GLOB NONBONDED_SSE2_SINGLE_SOURCES nb_kernel_sse2_single/*.c)
+endif()
+
 # These sources will be used in the parent directory's CMakeLists.txt
-set(NONBONDED_SOURCES ${NONBONDED_SOURCES} PARENT_SCOPE)
+set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} PARENT_SCOPE)
 
 
 
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h b/src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
new file mode 100644 (file)
index 0000000..ad6c21a
--- /dev/null
@@ -0,0 +1,1021 @@
+/*
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2011-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under 
+ * the terms of the GNU Lesser General Public License as published by the Free 
+ * Software Foundation; either version 2 of the License, or (at your option) any 
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction.  Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.  
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+#ifndef _kernelutil_x86_sse2_single_h_
+#define _kernelutil_x86_sse2_single_h_
+
+/* We require SSE2 now! */
+
+#include <math.h> 
+
+#include "gmx_x86_sse2.h"
+
+
+/* Normal sum of four xmm registers */
+#define gmx_mm_sum4_ps(t0,t1,t2,t3)  _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
+
+static gmx_inline __m128
+gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
+{
+    return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy) ), _mm_mul_ps(dz,dz) );
+}
+
+static int
+gmx_mm_any_lt(__m128 a, __m128 b)
+{
+    return _mm_movemask_ps(_mm_cmplt_ps(a,b));
+}
+
+/* Load a single value from 1-4 places, merge into xmm register */
+
+static __m128
+gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
+                             const float * gmx_restrict ptrB,
+                             const float * gmx_restrict ptrC,
+                             const float * gmx_restrict ptrD)
+{
+    __m128 t1,t2;
+
+    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
+    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
+    return _mm_unpacklo_ps(t1,t2);
+}
+
+static void
+gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
+                              float * gmx_restrict ptrB,
+                              float * gmx_restrict ptrC,
+                              float * gmx_restrict ptrD,
+                              __m128 xmm1)
+{
+    __m128 t2,t3,t4;
+
+    t3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
+    t2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+    t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(ptrA,xmm1);
+    _mm_store_ss(ptrB,t2);
+    _mm_store_ss(ptrC,t3);
+    _mm_store_ss(ptrD,t4);
+}
+
+/* Similar to store, but increments value in memory */
+static void
+gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
+                                  float * gmx_restrict ptrB,
+                                  float * gmx_restrict ptrC,
+                                  float * gmx_restrict ptrD, __m128 xmm1)
+{
+    __m128 tmp;
+
+    tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
+    tmp = _mm_add_ps(tmp,xmm1);
+    gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
+}
+
+
+static void
+gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
+                             const float * gmx_restrict p2,
+                             const float * gmx_restrict p3,
+                             const float * gmx_restrict p4,
+                             __m128 * gmx_restrict c6,
+                             __m128 * gmx_restrict c12)
+{
+    __m128 t1,t2,t3,t4;
+
+    t1   = _mm_castpd_ps(_mm_load_sd((const double *)p1));
+    t2   = _mm_castpd_ps(_mm_load_sd((const double *)p2));
+    t3   = _mm_castpd_ps(_mm_load_sd((const double *)p3));
+    t4   = _mm_castpd_ps(_mm_load_sd((const double *)p4));
+    t1   = _mm_unpacklo_ps(t1,t2);
+    t2   = _mm_unpacklo_ps(t3,t4);
+    *c6  = _mm_movelh_ps(t1,t2);
+    *c12 = _mm_movehl_ps(t2,t1);
+}
+
+/* Routines to load 1-4 rvec from 4 places.
+ * We mainly use these to load coordinates. The extra routines
+ * are very efficient for the water-water loops, since we e.g.
+ * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
+ */
+
+
+static void
+gmx_mm_load_1rvec_broadcast_ps(float *ptrA, __m128 *x, __m128 *y, __m128 *z)
+{
+    __m128 t1;
+
+    t1 = _mm_loadu_ps(ptrA);
+
+    *x = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+    *y = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+    *z = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+}
+
+static void
+gmx_mm_load_3rvec_broadcast_ps(float *ptrA,
+                               __m128 *x1, __m128 *y1, __m128 *z1,
+                               __m128 *x2, __m128 *y2, __m128 *z2,
+                               __m128 *x3, __m128 *y3, __m128 *z3)
+{
+    __m128 t1,t2,t3;
+
+    t1 = _mm_loadu_ps(ptrA);
+    t2 = _mm_loadu_ps(ptrA+4);
+
+    *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+    *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+    *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+    *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
+    *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
+    *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
+    *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
+    *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
+
+    t3  = _mm_load_ss(ptrA+8);
+    *z3 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+}
+
+static void
+gmx_mm_load_4rvec_broadcast_ps(float *ptrA,
+                               __m128 *x1, __m128 *y1, __m128 *z1,
+                               __m128 *x2, __m128 *y2, __m128 *z2,
+                               __m128 *x3, __m128 *y3, __m128 *z3,
+                               __m128 *x4, __m128 *y4, __m128 *z4)
+{
+    __m128 t1,t2,t3;
+    __m128 tA;
+
+    t1 = _mm_loadu_ps(ptrA);
+    t2 = _mm_loadu_ps(ptrA+4);
+    t3 = _mm_loadu_ps(ptrA+8);
+
+    *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+    *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+    *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+    *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
+    *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
+    *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
+    *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
+    *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
+    *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
+    *x4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+    *y4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(2,2,2,2));
+    *z4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(3,3,3,3));
+}
+
+
+static void
+gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
+                                  const float * gmx_restrict ptrB,
+                                  const float * gmx_restrict ptrC,
+                                  const float * gmx_restrict ptrD,
+                                  __m128 *      gmx_restrict x1,
+                                  __m128 *      gmx_restrict y1,
+                                  __m128 *      gmx_restrict z1)
+{
+    __m128 t1,t2,t3,t4,t5,t6,t7,t8;
+    t1   = _mm_castpd_ps(_mm_load_sd((const double *)ptrA));
+    t2   = _mm_castpd_ps(_mm_load_sd((const double *)ptrB));
+    t3   = _mm_castpd_ps(_mm_load_sd((const double *)ptrC));
+    t4   = _mm_castpd_ps(_mm_load_sd((const double *)ptrD));
+    t5 = _mm_load_ss(ptrA+2);
+    t6 = _mm_load_ss(ptrB+2);
+    t7 = _mm_load_ss(ptrC+2);
+    t8 = _mm_load_ss(ptrD+2);
+    t1 = _mm_unpacklo_ps(t1,t2);
+    t3 = _mm_unpacklo_ps(t3,t4);
+    *x1 = _mm_movelh_ps(t1,t3);
+    *y1 = _mm_movehl_ps(t3,t1);
+    t5  = _mm_unpacklo_ps(t5,t6);
+    t7  = _mm_unpacklo_ps(t7,t8);
+    *z1 = _mm_movelh_ps(t5,t7);
+}
+
+
+static void
+gmx_mm_load_2rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+                                  __m128 *x1, __m128 *y1, __m128 *z1,
+                                  __m128 *x2, __m128 *y2, __m128 *z2)
+{
+    __m128 t1,t2,t3,t4;
+    t1            = _mm_loadu_ps(ptrA);
+    t2            = _mm_loadu_ps(ptrB);
+    t3            = _mm_loadu_ps(ptrC);
+    t4            = _mm_loadu_ps(ptrD);
+    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+    *x1           = t1;
+    *y1           = t2;
+    *z1           = t3;
+    *x2           = t4;
+    t1            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrA+4));
+    t2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrB+4));
+    t3            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrC+4));
+    t4            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrD+4));
+    t1            = _mm_unpacklo_ps(t1,t3);
+    t2            = _mm_unpacklo_ps(t2,t4);
+    *y2           = _mm_unpacklo_ps(t1,t2);
+    *z2           = _mm_unpackhi_ps(t1,t2);
+}
+
+
+static void
+gmx_mm_load_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+                                  __m128 *x1, __m128 *y1, __m128 *z1,
+                                  __m128 *x2, __m128 *y2, __m128 *z2,
+                                  __m128 *x3, __m128 *y3, __m128 *z3) 
+{
+    __m128 t1,t2,t3,t4;
+    t1            = _mm_loadu_ps(ptrA);
+    t2            = _mm_loadu_ps(ptrB);
+    t3            = _mm_loadu_ps(ptrC);
+    t4            = _mm_loadu_ps(ptrD);
+    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+    *x1           = t1;
+    *y1           = t2;
+    *z1           = t3;
+    *x2           = t4;
+    t1            = _mm_loadu_ps(ptrA+4);
+    t2            = _mm_loadu_ps(ptrB+4);
+    t3            = _mm_loadu_ps(ptrC+4);
+    t4            = _mm_loadu_ps(ptrD+4);
+    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+    *y2           = t1;
+    *z2           = t2;
+    *x3           = t3;
+    *y3           = t4;
+    t1            = _mm_load_ss(ptrA+8);
+    t2            = _mm_load_ss(ptrB+8);
+    t3            = _mm_load_ss(ptrC+8);
+    t4            = _mm_load_ss(ptrD+8);
+    t1            = _mm_unpacklo_ps(t1,t3);
+    t3            = _mm_unpacklo_ps(t2,t4);
+    *z3           = _mm_unpacklo_ps(t1,t3);
+}
+
+
+static void
+gmx_mm_load_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+                                  __m128 *x1, __m128 *y1, __m128 *z1,
+                                  __m128 *x2, __m128 *y2, __m128 *z2,
+                                  __m128 *x3, __m128 *y3, __m128 *z3,
+                                  __m128 *x4, __m128 *y4, __m128 *z4) 
+{
+    __m128 t1,t2,t3,t4;
+    t1            = _mm_loadu_ps(ptrA);
+    t2            = _mm_loadu_ps(ptrB);
+    t3            = _mm_loadu_ps(ptrC);
+    t4            = _mm_loadu_ps(ptrD);
+    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+    *x1           = t1;
+    *y1           = t2;
+    *z1           = t3;
+    *x2           = t4;
+    t1            = _mm_loadu_ps(ptrA+4);
+    t2            = _mm_loadu_ps(ptrB+4);
+    t3            = _mm_loadu_ps(ptrC+4);
+    t4            = _mm_loadu_ps(ptrD+4);
+    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+    *y2           = t1;
+    *z2           = t2;
+    *x3           = t3;
+    *y3           = t4;
+    t1            = _mm_loadu_ps(ptrA+8);
+    t2            = _mm_loadu_ps(ptrB+8);
+    t3            = _mm_loadu_ps(ptrC+8);
+    t4            = _mm_loadu_ps(ptrD+8);
+    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
+    *z3           = t1;
+    *x4           = t2;
+    *y4           = t3;
+    *z4           = t4;
+}
+
+
+/* Routines to increment rvec in memory, typically use for j particle force updates */
+static void
+gmx_mm_increment_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
+{
+    __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
+    __m128 t1;
+
+    t1  = _mm_loadu_ps(ptrA);
+    xyz = _mm_and_ps(mask,xyz);
+    t1  = _mm_add_ps(t1,xyz);
+    _mm_storeu_ps(ptrA,t1);
+}
+
+
+static void
+gmx_mm_increment_3rvec_1ptr_noswizzle_ps(float *ptrA, 
+                                         __m128 xyz1, __m128 xyz2, __m128 xyz3)
+{
+    __m128 t1,t2,t3,t4;
+    __m128 tA,tB,tC;
+
+    tA   = _mm_loadu_ps(ptrA);
+    tB   = _mm_loadu_ps(ptrA+4);
+    tC   = _mm_load_ss(ptrA+8);
+
+    t1   = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
+    t2   = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
+
+    t3   = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
+    t3   = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
+
+    t4   = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
+
+    tA   = _mm_add_ps(tA,t3);
+    tB   = _mm_add_ps(tB,t4);
+    tC   = _mm_add_ss(tC,t2);
+
+    _mm_storeu_ps(ptrA,tA);
+    _mm_storeu_ps(ptrA+4,tB);
+    _mm_store_ss(ptrA+8,tC);
+}
+
+static void
+gmx_mm_increment_4rvec_1ptr_noswizzle_ps(float *ptrA, 
+                                         __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
+{
+    __m128 t1,t2,t3,t4,t5;
+    __m128 tA,tB,tC;
+
+    tA   = _mm_loadu_ps(ptrA);
+    tB   = _mm_loadu_ps(ptrA+4);
+    tC   = _mm_loadu_ps(ptrA+8);
+
+    t1   = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
+    t2   = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
+
+    t3   = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
+    t3   = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
+
+    t4   = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
+    t5   = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
+
+    t2   = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0));  /*  x4 x4 z3 z3 */
+    t5   = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0)); /* z4 y4 x4 z3 */
+
+    tA   = _mm_add_ps(tA,t3);
+    tB   = _mm_add_ps(tB,t4);
+    tC   = _mm_add_ps(tC,t5);
+
+    _mm_storeu_ps(ptrA,tA);
+    _mm_storeu_ps(ptrA+4,tB);
+    _mm_storeu_ps(ptrA+8,tC);
+
+}
+
+
+
+static void
+gmx_mm_increment_1rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC,float *ptrD,
+                                       __m128 x1, __m128 y1, __m128 z1) 
+{
+    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
+    t5          = _mm_unpacklo_ps(y1,z1);
+    t6          = _mm_unpackhi_ps(y1,z1);
+    t7          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
+    t8          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
+    t9          = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
+    t10         = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
+    t1          = _mm_load_ss(ptrA);
+    t1          = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
+    t1          = _mm_add_ps(t1,t7);
+    _mm_store_ss(ptrA,t1);
+    _mm_storeh_pi((__m64 *)(ptrA+1),t1);
+    t2          = _mm_load_ss(ptrB);
+    t2          = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
+    t2          = _mm_add_ps(t2,t8);
+    _mm_store_ss(ptrB,t2);
+    _mm_storeh_pi((__m64 *)(ptrB+1),t2);
+    t3          = _mm_load_ss(ptrC);
+    t3          = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
+    t3          = _mm_add_ps(t3,t9);
+    _mm_store_ss(ptrC,t3);
+    _mm_storeh_pi((__m64 *)(ptrC+1),t3);
+    t4          = _mm_load_ss(ptrD);
+    t4          = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
+    t4          = _mm_add_ps(t4,t10);
+    _mm_store_ss(ptrD,t4);
+    _mm_storeh_pi((__m64 *)(ptrD+1),t4);
+}
+
+
+
+
+static void
+gmx_mm_increment_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+                                       __m128 x1, __m128 y1, __m128 z1,
+                                       __m128 x2, __m128 y2, __m128 z2,
+                                       __m128 x3, __m128 y3, __m128 z3) 
+{
+    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+    __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
+    __m128 t20,t21,t22,t23,t24,t25;
+
+    t13         = _mm_unpackhi_ps(x1,y1);
+    x1          = _mm_unpacklo_ps(x1,y1);
+    t14         = _mm_unpackhi_ps(z1,x2);
+    z1          = _mm_unpacklo_ps(z1,x2);
+    t15         = _mm_unpackhi_ps(y2,z2);
+    y2          = _mm_unpacklo_ps(y2,z2);
+    t16         = _mm_unpackhi_ps(x3,y3);
+    x3          = _mm_unpacklo_ps(x3,y3);
+    t17         = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
+    t18         = _mm_movehl_ps(z3,z3);
+    t19         = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
+    t20         = _mm_movelh_ps(x1,z1);
+    t21         = _mm_movehl_ps(z1,x1);
+    t22         = _mm_movelh_ps(t13,t14);
+    t14         = _mm_movehl_ps(t14,t13);
+    t23         = _mm_movelh_ps(y2,x3);
+    t24         = _mm_movehl_ps(x3,y2);
+    t25         = _mm_movelh_ps(t15,t16);
+    t16         = _mm_movehl_ps(t16,t15);
+    t1          = _mm_loadu_ps(ptrA);
+    t2          = _mm_loadu_ps(ptrA+4);
+    t3          = _mm_load_ss(ptrA+8);
+    t1          = _mm_add_ps(t1,t20);
+    t2          = _mm_add_ps(t2,t23);
+    t3          = _mm_add_ss(t3,z3);
+    _mm_storeu_ps(ptrA,t1);
+    _mm_storeu_ps(ptrA+4,t2);
+    _mm_store_ss(ptrA+8,t3);
+    t4          = _mm_loadu_ps(ptrB);
+    t5          = _mm_loadu_ps(ptrB+4);
+    t6          = _mm_load_ss(ptrB+8);
+    t4          = _mm_add_ps(t4,t21);
+    t5          = _mm_add_ps(t5,t24);
+    t6          = _mm_add_ss(t6,t17);
+    _mm_storeu_ps(ptrB,t4);
+    _mm_storeu_ps(ptrB+4,t5);
+    _mm_store_ss(ptrB+8,t6);
+    t7          = _mm_loadu_ps(ptrC);
+    t8          = _mm_loadu_ps(ptrC+4);
+    t9          = _mm_load_ss(ptrC+8);
+    t7          = _mm_add_ps(t7,t22);
+    t8          = _mm_add_ps(t8,t25);
+    t9          = _mm_add_ss(t9,t18);
+    _mm_storeu_ps(ptrC,t7);
+    _mm_storeu_ps(ptrC+4,t8);
+    _mm_store_ss(ptrC+8,t9);
+    t10         = _mm_loadu_ps(ptrD);
+    t11         = _mm_loadu_ps(ptrD+4);
+    t12         = _mm_load_ss(ptrD+8);
+    t10         = _mm_add_ps(t10,t14);
+    t11         = _mm_add_ps(t11,t16);
+    t12         = _mm_add_ss(t12,t19);
+    _mm_storeu_ps(ptrD,t10);
+    _mm_storeu_ps(ptrD+4,t11);
+    _mm_store_ss(ptrD+8,t12);
+}
+
+
+static void
+gmx_mm_increment_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+                                       __m128 x1, __m128 y1, __m128 z1,
+                                       __m128 x2, __m128 y2, __m128 z2,
+                                       __m128 x3, __m128 y3, __m128 z3,
+                                       __m128 x4, __m128 y4, __m128 z4) 
+{
+    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
+    __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
+    __m128 t23,t24;
+    t13         = _mm_unpackhi_ps(x1,y1);
+    x1          = _mm_unpacklo_ps(x1,y1);
+    t14         = _mm_unpackhi_ps(z1,x2);
+    z1          = _mm_unpacklo_ps(z1,x2);
+    t15         = _mm_unpackhi_ps(y2,z2);
+    y2          = _mm_unpacklo_ps(y2,z2);
+    t16         = _mm_unpackhi_ps(x3,y3);
+    x3          = _mm_unpacklo_ps(x3,y3);
+    t17         = _mm_unpackhi_ps(z3,x4);
+    z3          = _mm_unpacklo_ps(z3,x4);
+    t18         = _mm_unpackhi_ps(y4,z4);
+    y4          = _mm_unpacklo_ps(y4,z4);
+    t19         = _mm_movelh_ps(x1,z1);
+    z1          = _mm_movehl_ps(z1,x1);
+    t20         = _mm_movelh_ps(t13,t14);
+    t14         = _mm_movehl_ps(t14,t13);
+    t21         = _mm_movelh_ps(y2,x3);
+    x3          = _mm_movehl_ps(x3,y2);
+    t22         = _mm_movelh_ps(t15,t16);
+    t16         = _mm_movehl_ps(t16,t15);
+    t23         = _mm_movelh_ps(z3,y4);
+    y4          = _mm_movehl_ps(y4,z3);
+    t24         = _mm_movelh_ps(t17,t18);
+    t18         = _mm_movehl_ps(t18,t17);
+    t1          = _mm_loadu_ps(ptrA);
+    t2          = _mm_loadu_ps(ptrA+4);
+    t3          = _mm_loadu_ps(ptrA+8);
+    t1          = _mm_add_ps(t1,t19);
+    t2          = _mm_add_ps(t2,t21);
+    t3          = _mm_add_ps(t3,t23);
+    _mm_storeu_ps(ptrA,t1);
+    _mm_storeu_ps(ptrA+4,t2);
+    _mm_storeu_ps(ptrA+8,t3);
+    t4          = _mm_loadu_ps(ptrB);
+    t5          = _mm_loadu_ps(ptrB+4);
+    t6          = _mm_loadu_ps(ptrB+8);
+    t4          = _mm_add_ps(t4,z1);
+    t5          = _mm_add_ps(t5,x3);
+    t6          = _mm_add_ps(t6,y4);
+    _mm_storeu_ps(ptrB,t4);
+    _mm_storeu_ps(ptrB+4,t5);
+    _mm_storeu_ps(ptrB+8,t6);
+    t7          = _mm_loadu_ps(ptrC);
+    t8          = _mm_loadu_ps(ptrC+4);
+    t9          = _mm_loadu_ps(ptrC+8);
+    t7          = _mm_add_ps(t7,t20);
+    t8          = _mm_add_ps(t8,t22);
+    t9          = _mm_add_ps(t9,t24);
+    _mm_storeu_ps(ptrC,t7);
+    _mm_storeu_ps(ptrC+4,t8);
+    _mm_storeu_ps(ptrC+8,t9);
+    t10         = _mm_loadu_ps(ptrD);
+    t11         = _mm_loadu_ps(ptrD+4);
+    t12         = _mm_loadu_ps(ptrD+8);
+    t10         = _mm_add_ps(t10,t14);
+    t11         = _mm_add_ps(t11,t16);
+    t12         = _mm_add_ps(t12,t18);
+    _mm_storeu_ps(ptrD,t10);
+    _mm_storeu_ps(ptrD+4,t11);
+    _mm_storeu_ps(ptrD+8,t12);
+}
+
+
+/* Routines to decrement rvec in memory */
+static void
+gmx_mm_decrement_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
+{
+    __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
+    __m128 t1;
+
+    t1  = _mm_loadu_ps(ptrA);
+    xyz = _mm_and_ps(mask,xyz);
+    t1  = _mm_sub_ps(t1,xyz);
+    _mm_storeu_ps(ptrA,t1);
+}
+
+
+static void
+gmx_mm_decrement_3rvec_1ptr_noswizzle_ps(float *ptrA, 
+                                         __m128 xyz1, __m128 xyz2, __m128 xyz3)
+{
+    __m128 t1,t2,t3,t4;
+    __m128 tA,tB,tC;
+
+    tA   = _mm_loadu_ps(ptrA);
+    tB   = _mm_loadu_ps(ptrA+4);
+    tC   = _mm_load_ss(ptrA+8);
+
+    t1   = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
+    t2   = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
+
+    t3   = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
+    t3   = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
+
+    t4   = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
+
+    tA   = _mm_sub_ps(tA,t3);
+    tB   = _mm_sub_ps(tB,t4);
+    tC   = _mm_sub_ss(tC,t2);
+
+    _mm_storeu_ps(ptrA,tA);
+    _mm_storeu_ps(ptrA+4,tB);
+    _mm_store_ss(ptrA+8,tC);
+}
+
+static void
+gmx_mm_decrement_4rvec_1ptr_noswizzle_ps(float *ptrA, 
+                                         __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
+{
+    __m128 t1,t2,t3,t4,t5;
+    __m128 tA,tB,tC;
+
+    tA   = _mm_loadu_ps(ptrA);
+    tB   = _mm_loadu_ps(ptrA+4);
+    tC   = _mm_loadu_ps(ptrA+8);
+
+    t1   = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
+    t2   = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
+
+    t3   = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
+    t3   = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
+
+    t4   = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
+
+    t5   = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
+    t2   = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0));  /*  x4 x4 z3 z3 */
+    t5   = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0)); /* z4 y4 x4 z3 */
+
+    tA   = _mm_sub_ps(tA,t3);
+    tB   = _mm_sub_ps(tB,t4);
+    tC   = _mm_sub_ps(tC,t5);
+
+    _mm_storeu_ps(ptrA,tA);
+    _mm_storeu_ps(ptrA+4,tB);
+    _mm_storeu_ps(ptrA+8,tC);
+
+}
+
+
+static void
+gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
+                                       float * gmx_restrict ptrB,
+                                       float * gmx_restrict ptrC,
+                                       float * gmx_restrict ptrD,
+                                       __m128 x1, __m128 y1, __m128 z1)
+{
+    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
+    t5          = _mm_unpacklo_ps(y1,z1);
+    t6          = _mm_unpackhi_ps(y1,z1);
+    t7          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
+    t8          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
+    t9          = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
+    t10         = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
+    t1          = _mm_load_ss(ptrA);
+    t1          = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
+    t1          = _mm_sub_ps(t1,t7);
+    _mm_store_ss(ptrA,t1);
+    _mm_storeh_pi((__m64 *)(ptrA+1),t1);
+    t2          = _mm_load_ss(ptrB);
+    t2          = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
+    t2          = _mm_sub_ps(t2,t8);
+    _mm_store_ss(ptrB,t2);
+    _mm_storeh_pi((__m64 *)(ptrB+1),t2);
+    t3          = _mm_load_ss(ptrC);
+    t3          = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
+    t3          = _mm_sub_ps(t3,t9);
+    _mm_store_ss(ptrC,t3);
+    _mm_storeh_pi((__m64 *)(ptrC+1),t3);
+    t4          = _mm_load_ss(ptrD);
+    t4          = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
+    t4          = _mm_sub_ps(t4,t10);
+    _mm_store_ss(ptrD,t4);
+    _mm_storeh_pi((__m64 *)(ptrD+1),t4);
+}
+
+
+
+static void
+gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+                                       __m128 x1, __m128 y1, __m128 z1,
+                                       __m128 x2, __m128 y2, __m128 z2,
+                                       __m128 x3, __m128 y3, __m128 z3) 
+{
+    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+    __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
+    __m128 t20,t21,t22,t23,t24,t25;
+
+    t13         = _mm_unpackhi_ps(x1,y1);
+    x1          = _mm_unpacklo_ps(x1,y1);
+    t14         = _mm_unpackhi_ps(z1,x2);
+    z1          = _mm_unpacklo_ps(z1,x2);
+    t15         = _mm_unpackhi_ps(y2,z2);
+    y2          = _mm_unpacklo_ps(y2,z2);
+    t16         = _mm_unpackhi_ps(x3,y3);
+    x3          = _mm_unpacklo_ps(x3,y3);
+    t17         = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
+    t18         = _mm_movehl_ps(z3,z3);
+    t19         = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
+    t20         = _mm_movelh_ps(x1,z1);
+    t21         = _mm_movehl_ps(z1,x1);
+    t22         = _mm_movelh_ps(t13,t14);
+    t14         = _mm_movehl_ps(t14,t13);
+    t23         = _mm_movelh_ps(y2,x3);
+    t24         = _mm_movehl_ps(x3,y2);
+    t25         = _mm_movelh_ps(t15,t16);
+    t16         = _mm_movehl_ps(t16,t15);
+    t1          = _mm_loadu_ps(ptrA);
+    t2          = _mm_loadu_ps(ptrA+4);
+    t3          = _mm_load_ss(ptrA+8);
+    t1          = _mm_sub_ps(t1,t20);
+    t2          = _mm_sub_ps(t2,t23);
+    t3          = _mm_sub_ss(t3,z3);
+    _mm_storeu_ps(ptrA,t1);
+    _mm_storeu_ps(ptrA+4,t2);
+    _mm_store_ss(ptrA+8,t3);
+    t4          = _mm_loadu_ps(ptrB);
+    t5          = _mm_loadu_ps(ptrB+4);
+    t6          = _mm_load_ss(ptrB+8);
+    t4          = _mm_sub_ps(t4,t21);
+    t5          = _mm_sub_ps(t5,t24);
+    t6          = _mm_sub_ss(t6,t17);
+    _mm_storeu_ps(ptrB,t4);
+    _mm_storeu_ps(ptrB+4,t5);
+    _mm_store_ss(ptrB+8,t6);
+    t7          = _mm_loadu_ps(ptrC);
+    t8          = _mm_loadu_ps(ptrC+4);
+    t9          = _mm_load_ss(ptrC+8);
+    t7          = _mm_sub_ps(t7,t22);
+    t8          = _mm_sub_ps(t8,t25);
+    t9          = _mm_sub_ss(t9,t18);
+    _mm_storeu_ps(ptrC,t7);
+    _mm_storeu_ps(ptrC+4,t8);
+    _mm_store_ss(ptrC+8,t9);
+    t10         = _mm_loadu_ps(ptrD);
+    t11         = _mm_loadu_ps(ptrD+4);
+    t12         = _mm_load_ss(ptrD+8);
+    t10         = _mm_sub_ps(t10,t14);
+    t11         = _mm_sub_ps(t11,t16);
+    t12         = _mm_sub_ss(t12,t19);
+    _mm_storeu_ps(ptrD,t10);
+    _mm_storeu_ps(ptrD+4,t11);
+    _mm_store_ss(ptrD+8,t12);
+}
+
+
+static void
+gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+                                       __m128 x1, __m128 y1, __m128 z1,
+                                       __m128 x2, __m128 y2, __m128 z2,
+                                       __m128 x3, __m128 y3, __m128 z3,
+                                       __m128 x4, __m128 y4, __m128 z4) 
+{
+    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
+    __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
+    __m128 t23,t24;
+    t13         = _mm_unpackhi_ps(x1,y1);
+    x1          = _mm_unpacklo_ps(x1,y1);
+    t14         = _mm_unpackhi_ps(z1,x2);
+    z1          = _mm_unpacklo_ps(z1,x2);
+    t15         = _mm_unpackhi_ps(y2,z2);
+    y2          = _mm_unpacklo_ps(y2,z2);
+    t16         = _mm_unpackhi_ps(x3,y3);
+    x3          = _mm_unpacklo_ps(x3,y3);
+    t17         = _mm_unpackhi_ps(z3,x4);
+    z3          = _mm_unpacklo_ps(z3,x4);
+    t18         = _mm_unpackhi_ps(y4,z4);
+    y4          = _mm_unpacklo_ps(y4,z4);
+    t19         = _mm_movelh_ps(x1,z1);
+    z1          = _mm_movehl_ps(z1,x1);
+    t20         = _mm_movelh_ps(t13,t14);
+    t14         = _mm_movehl_ps(t14,t13);
+    t21         = _mm_movelh_ps(y2,x3);
+    x3          = _mm_movehl_ps(x3,y2);
+    t22         = _mm_movelh_ps(t15,t16);
+    t16         = _mm_movehl_ps(t16,t15);
+    t23         = _mm_movelh_ps(z3,y4);
+    y4          = _mm_movehl_ps(y4,z3);
+    t24         = _mm_movelh_ps(t17,t18);
+    t18         = _mm_movehl_ps(t18,t17);
+    t1          = _mm_loadu_ps(ptrA);
+    t2          = _mm_loadu_ps(ptrA+4);
+    t3          = _mm_loadu_ps(ptrA+8);
+    t1          = _mm_sub_ps(t1,t19);
+    t2          = _mm_sub_ps(t2,t21);
+    t3          = _mm_sub_ps(t3,t23);
+    _mm_storeu_ps(ptrA,t1);
+    _mm_storeu_ps(ptrA+4,t2);
+    _mm_storeu_ps(ptrA+8,t3);
+    t4          = _mm_loadu_ps(ptrB);
+    t5          = _mm_loadu_ps(ptrB+4);
+    t6          = _mm_loadu_ps(ptrB+8);
+    t4          = _mm_sub_ps(t4,z1);
+    t5          = _mm_sub_ps(t5,x3);
+    t6          = _mm_sub_ps(t6,y4);
+    _mm_storeu_ps(ptrB,t4);
+    _mm_storeu_ps(ptrB+4,t5);
+    _mm_storeu_ps(ptrB+8,t6);
+    t7          = _mm_loadu_ps(ptrC);
+    t8          = _mm_loadu_ps(ptrC+4);
+    t9          = _mm_loadu_ps(ptrC+8);
+    t7          = _mm_sub_ps(t7,t20);
+    t8          = _mm_sub_ps(t8,t22);
+    t9          = _mm_sub_ps(t9,t24);
+    _mm_storeu_ps(ptrC,t7);
+    _mm_storeu_ps(ptrC+4,t8);
+    _mm_storeu_ps(ptrC+8,t9);
+    t10         = _mm_loadu_ps(ptrD);
+    t11         = _mm_loadu_ps(ptrD+4);
+    t12         = _mm_loadu_ps(ptrD+8);
+    t10         = _mm_sub_ps(t10,t14);
+    t11         = _mm_sub_ps(t11,t16);
+    t12         = _mm_sub_ps(t12,t18);
+    _mm_storeu_ps(ptrD,t10);
+    _mm_storeu_ps(ptrD+4,t11);
+    _mm_storeu_ps(ptrD+8,t12);
+}
+
+
+
+static gmx_inline void
+gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
+                                      float *fptr,
+                                      float *fshiftptr)
+{
+    __m128 t1,t2,t3;
+
+    /* transpose data */
+    t1 = fix1;
+    _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));
+
+    t2 = _mm_load_ss(fptr);
+    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
+    t3 = _mm_load_ss(fshiftptr);
+    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
+
+    t2 = _mm_add_ps(t2,fix1);
+    t3 = _mm_add_ps(t3,fix1);
+
+    _mm_store_ss(fptr,t2);
+    _mm_storeh_pi((__m64 *)(fptr+1),t2);
+    _mm_store_ss(fshiftptr,t3);
+    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
+}
+
+static gmx_inline void
+gmx_mm_update_iforce_2atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
+                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
+                                      float *fptr,
+                                      float *fshiftptr)
+{
+    __m128 t1,t2,t4;
+
+    /* transpose data */
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
+    t1 = _mm_unpacklo_ps(fiy2,fiz2);
+    t2 = _mm_unpackhi_ps(fiy2,fiz2);
+
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
+    t1   = _mm_add_ps(t1,t2);
+    t2   = _mm_movehl_ps(t2,t1);
+    fiy2 = _mm_add_ps(t1,t2);
+
+    _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
+    t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
+    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));
+
+    t4 = _mm_load_ss(fshiftptr+2);
+    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
+
+    t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2));   /* fiy2  -   fix2 fiz1 */
+    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2  -   fiz1 */
+    t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1));   /* fiy1 fix1  -   fiz2 */
+
+    t1 = _mm_add_ps(t1,t2);
+    t1 = _mm_add_ps(t1,t4); /* y x - z */
+
+    _mm_store_ss(fshiftptr+2,t1);
+    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
+}
+
+
+
+static gmx_inline void
+gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
+                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
+                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
+                                      float *fptr,
+                                      float *fshiftptr)
+{
+    __m128 t1,t2,t3,t4;
+
+    /* transpose data */
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
+    t2   = _mm_movehl_ps(_mm_setzero_ps(),fiz3);
+    t1   = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
+    t3   = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));
+
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
+    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1)  , _mm_add_ps(t2,t3));
+
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
+
+    t4 = _mm_load_ss(fshiftptr+2);
+    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
+
+    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
+    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
+    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
+    t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */
+
+    t1 = _mm_add_ps(t1,t2);
+    t3 = _mm_add_ps(t3,t4);
+    t1 = _mm_add_ps(t1,t3); /* y x - z */
+
+    _mm_store_ss(fshiftptr+2,t1);
+    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
+}
+
+
+static gmx_inline void
+gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
+                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
+                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
+                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
+                                      float *fptr,
+                                      float *fshiftptr)
+{
+    __m128 t1,t2,t3,t4,t5;
+
+    /* transpose data */
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
+    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);
+
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
+    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));
+
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
+
+    t5 = _mm_load_ss(fshiftptr+2);
+    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
+
+    t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
+    t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
+    t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
+    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
+    t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));
+
+    t1 = _mm_add_ps(t1,t2);
+    t3 = _mm_add_ps(t3,t4);
+    t1 = _mm_add_ps(t1,t3);
+    t5 = _mm_add_ps(t5,t1);
+
+    _mm_store_ss(fshiftptr+2,t5);
+    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
+}
+
+
+
+static void
+gmx_mm_update_1pot_ps(__m128 pot1, float *ptrA)
+{
+    pot1 = _mm_add_ps(pot1,_mm_movehl_ps(_mm_setzero_ps(),pot1));
+    pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
+    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+}
+
+static void
+gmx_mm_update_2pot_ps(__m128 pot1, float *ptrA,
+                      __m128 pot2, float *ptrB)
+{
+    __m128 t1,t2;
+    t1   = _mm_movehl_ps(pot2,pot1);
+    t2   = _mm_movelh_ps(pot1,pot2);
+    t1   = _mm_add_ps(t1,t2);
+    t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
+    pot1 = _mm_add_ps(t1,t2);
+    pot2 = _mm_movehl_ps(t2,pot1);
+    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
+}
+
+
+static void
+gmx_mm_update_4pot_ps(__m128 pot1, float *ptrA,
+                      __m128 pot2, float *ptrB,
+                      __m128 pot3, float *ptrC,
+                      __m128 pot4, float *ptrD)
+{
+    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
+    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
+    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
+    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
+    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
+    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
+    _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
+    _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
+}
+
+
+#endif /* _kernelutil_x86_sse2_single_h_ */
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/make_nb_kernel_sse2_single.py b/src/gmxlib/nonbonded/nb_kernel_sse2_single/make_nb_kernel_sse2_single.py
new file mode 100755 (executable)
index 0000000..d810bde
--- /dev/null
@@ -0,0 +1,489 @@
+#!/usr/bin/python
+
+import sys
+import os
+sys.path.append ( "../preprocessor" )
+from gmxpreprocess import gmxpreprocess
+
+# "The happiest programs are programs that write other programs."
+#
+#
+# This script controls the generation of Gromacs nonbonded kernels.
+#
+# We no longer generate kernels on-the-fly, so this file is not run
+# during a Gromacs compile - only when we need to update the kernels (=rarely).
+#
+# To maximize performance, each combination of interactions in Gromacs
+# has a separate nonbonded kernel without conditionals in the code.
+# To avoid writing hundreds of different routines for each architecture,
+# we instead use a custom preprocessor so we can encode the conditionals
+# and expand for-loops (e.g, for water-water interactions)
+# from a general kernel template. While that file will contain quite a
+# few preprocessor directives, it is still an order of magnitude easier
+# to maintain than ~200 different kernels (not to mention it avoids bugs).
+#
+# To actually generate the kernels, this program iteratively calls the
+# preprocessor with different define settings corresponding to all
+# combinations of coulomb/van-der-Waals/geometry options.
+#
+# A main goal in the design was to make this new generator _general_. For
+# this reason we have used a lot of different fields to identify a particular
+# kernel and interaction. Basically, each kernel will have a name like
+#
+# nbkernel_ElecXX_VdwYY_GeomZZ_VF_QQ()
+#
+# Where XX/YY/ZZ/VF are strings to identify what the kernel computes.
+#
+# Elec/Vdw describe the type of interaction for electrostatics and van der Waals.
+# The geometry settings correspond e.g. to water-water or water-particle kernels,
+# and finally the VF setting is V,F,or VF depending on whether we calculate
+# only the potential, only the force, or both of them. The final string (QQ)
+# is the architecture/language/optimization of the kernel.
+#
+Arch       = 'sse2_single'
+
+# Explanation of the 'properties':
+#
+# It is cheap to compute r^2, and the kernels require various other functions of r for
+# different kinds of interaction. Depending on the needs of the kernel and the available
+# processor instructions, this will be done in different ways.
+#
+# 'rinv' means we need 1/r, which is calculated as 1/sqrt(r^2).
+# 'rinvsq' means we need 1/(r*r). This is calculated as rinv*rinv if we already did rinv, otherwise 1/r^2.
+# 'r' is similarly calculated as r^2*rinv when needed
+# 'table' means the interaction is tabulated, in which case we will calculate a table index before the interaction
+# 'shift' means the interaction will be modified by a constant to make it zero at the cutoff.
+# 'cutoff' means the interaction is set to 0.0 outside the cutoff
+#
+
+FileHeader = \
+'/*\n' \
+' * Note: this file was generated by the Gromacs '+Arch+' kernel generator.\n' \
+' *\n' \
+' *                This source code is part of\n' \
+' *\n' \
+' *                 G   R   O   M   A   C   S\n' \
+' *\n' \
+' * Copyright (c) 2001-2012, The GROMACS Development Team\n' \
+' *\n' \
+' * Gromacs is a library for molecular simulation and trajectory analysis,\n' \
+' * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for\n' \
+' * a full list of developers and information, check out http://www.gromacs.org\n' \
+' *\n' \
+' * This program is free software; you can redistribute it and/or modify it under\n' \
+' * the terms of the GNU Lesser General Public License as published by the Free\n' \
+' * Software Foundation; either version 2 of the License, or (at your option) any\n' \
+' * later version.\n' \
+' *\n' \
+' * To help fund GROMACS development, we humbly ask that you cite\n' \
+' * the papers people have written on it - you can find them on the website.\n' \
+' */\n'
+
+###############################################
+# ELECTROSTATICS
+# Interactions and flags for them
+###############################################
+ElectrostaticsList = {
+    'None'                    : [],
+    'Coulomb'                 : ['rinv','rinvsq'],
+    'ReactionField'           : ['rinv','rinvsq'],
+    'GeneralizedBorn'         : ['rinv','r'],
+    'CubicSplineTable'        : ['rinv','r','table'],
+    'Ewald'                   : ['rinv','rinvsq','r'],
+}
+
+
+###############################################
+# VAN DER WAALS
+# Interactions and flags for them
+###############################################
+VdwList = {
+    'None'                    : [],
+    'LennardJones'            : ['rinvsq'],
+#    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse2 to reduce number of kernels and simply the template
+    'CubicSplineTable'        : ['rinv','r','table'],
+}
+
+
+###############################################
+# MODIFIERS
+# Different ways to adjust/modify interactions to conserve energy
+###############################################
+ModifierList = {
+    'None'                    : [],
+    'ExactCutoff'             : ['exactcutoff'],        # Zero the interaction outside the cutoff, used for reaction-field-zero
+    'PotentialShift'          : ['shift','exactcutoff'],
+    'PotentialSwitch'         : ['rinv','r','switch','exactcutoff']
+}
+
+
+###############################################
+# GEOMETRY COMBINATIONS
+###############################################
+GeometryNameList = [
+    [ 'Particle' , 'Particle' ],
+    [ 'Water3'   , 'Particle' ],
+    [ 'Water3'   , 'Water3'   ],
+    [ 'Water4'   , 'Particle' ],
+    [ 'Water4'   , 'Water4'   ]
+]
+
+
+###############################################
+# POTENTIAL / FORCE
+###############################################
+VFList = [
+    'PotentialAndForce',
+# 'Potential',   # Not used yet
+    'Force'
+]
+
+
+###############################################
+# GEOMETRY PROPERTIES
+###############################################
+# Dictionaries with lists telling which interactions are present
+# 1,2,3 means particles 1,2,3 (but not 0) have electrostatics!
+GeometryElectrostatics = {
+    'Particle'  : [ 0 ],
+    'Particle2' : [ 0 , 1 ],
+    'Particle3' : [ 0 , 1 , 2 ],
+    'Particle4' : [ 0 , 1 , 2 , 3 ],
+    'Water3'    : [ 0 , 1 , 2 ],
+    'Water4'    : [ 1 , 2 , 3 ]
+}
+
+GeometryVdw = {
+    'Particle'  : [ 0 ],
+    'Particle2' : [ 0 , 1 ],
+    'Particle3' : [ 0 , 1 , 2 ],
+    'Particle4' : [ 0 , 1 , 2 , 3 ],
+    'Water3'    : [ 0 ],
+    'Water4'    : [ 0 ]
+}
+
+
+
+
+# Dictionary to abbreviate all strings (mixed from all the lists)
+Abbreviation = {
+    'None'                    : 'None',
+    'Coulomb'                 : 'Coul',
+    'Ewald'                   : 'Ew',
+    'ReactionField'           : 'RF',
+    'GeneralizedBorn'         : 'GB',
+    'CubicSplineTable'        : 'CSTab',
+    'LennardJones'            : 'LJ',
+    'Buckingham'              : 'Bham',
+    'PotentialShift'          : 'Sh',
+    'PotentialSwitch'         : 'Sw',
+    'ExactCutoff'             : 'Cut',
+    'PotentialAndForce'       : 'VF',
+    'Potential'               : 'V',
+    'Force'                   : 'F',
+    'Water3'                  : 'W3',
+    'Water4'                  : 'W4',
+    'Particle'                : 'P1',
+    'Particle2'               : 'P2',
+    'Particle3'               : 'P3',
+    'Particle4'               : 'P4'
+}
+
+
+###############################################
+# Functions
+###############################################
+
+# Return a string with the kernel name from current settings
+def MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom):
+    ElecStr = 'Elec' + Abbreviation[KernelElec]
+    if(KernelElecMod!='None'):
+        ElecStr = ElecStr + Abbreviation[KernelElecMod]
+    VdwStr  = 'Vdw'  + Abbreviation[KernelVdw]
+    if(KernelVdwMod!='None'):
+        VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+    GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+    return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + Arch
+
+def MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+    ElecStr = 'Elec' + Abbreviation[KernelElec]
+    if(KernelElecMod!='None'):
+        ElecStr = ElecStr + Abbreviation[KernelElecMod]
+    VdwStr  = 'Vdw'  + Abbreviation[KernelVdw]
+    if(KernelVdwMod!='None'):
+        VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+    GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+    VFStr   = Abbreviation[KernelVF]
+    return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + VFStr + '_' + Arch
+
+# Return a string with a declaration to use for the kernel;
+# this will be a sequence of string combinations as well as the actual function name
+# Dont worry about field widths - that is just pretty-printing for the header!
+def MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF):
+    KernelStr   = '\"'+KernelName+'\"'
+    ArchStr     = '\"'+Arch+'\"'
+    ElecStr     = '\"'+KernelElec+'\"'
+    ElecModStr  = '\"'+KernelElecMod+'\"'
+    VdwStr      = '\"'+KernelVdw+'\"'
+    VdwModStr   = '\"'+KernelVdwMod+'\"'
+    GeomStr     = '\"'+KernelGeom[0]+KernelGeom[1]+'\"'
+    OtherStr    = '\"'+KernelOther+'\"'
+    VFStr       = '\"'+KernelVF+'\"'
+
+    ThisSpec = ArchStr+', '+ElecStr+', '+ElecModStr+', '+VdwStr+', '+VdwModStr+', '+GeomStr+', '+OtherStr+', '+VFStr
+    ThisDecl = '    { '+KernelName+', '+KernelStr+', '+ThisSpec+' }'
+    return ThisDecl
+
+
+# Returns 1 if this kernel should be created, 0 if we should skip it
+# This routine is not critical - it is not the end of the world if we create more kernels,
+# but since the number is pretty large we save both space and compile-time by reducing it a bit.
+def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+
+    # No need for kernels without interactions
+    if(KernelElec=='None' and KernelVdw=='None'):
+        return 0
+
+    # No need for modifiers without interactions
+    if((KernelElec=='None' and KernelElecMod!='None') or (KernelVdw=='None' and KernelVdwMod!='None')):
+        return 0
+
+    # No need for LJ-only water optimization, or water optimization with implicit solvent.
+    if('Water' in KernelGeom[0] and (KernelElec=='None' or 'GeneralizedBorn' in KernelElec)):
+        return 0
+
+    # Non-matching table settings are pointless
+    if( ('Table' in KernelElec) and ('Table' in KernelVdw) and KernelElec!=KernelVdw ):
+        return 0
+
+    # Try to reduce the number of different switch/shift options to get a reasonable number of kernels
+    # For electrostatics, reaction-field can use 'exactcutoff', and ewald can use switch or shift.
+    if(KernelElecMod=='ExactCutoff' and KernelElec!='ReactionField'):
+        return 0
+    if(KernelElecMod in ['PotentialShift','PotentialSwitch'] and KernelElec!='Ewald'):
+        return 0
+    # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
+    if((KernelVdwMod=='ExactCutoff') or
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+        return 0
+
+    # Choose either switch or shift and don't mix them...
+    if((KernelElecMod=='PotentialShift' and KernelVdwMod=='PotentialSwitch') or
+       (KernelElecMod=='PotentialSwitch' and KernelVdwMod=='PotentialShift')):
+        return 0
+
+    # Don't use a Vdw kernel with a modifier if the electrostatics one does not have one
+    if(KernelElec!='None' and KernelElecMod=='None' and KernelVdwMod!='None'):
+        return 0
+
+    # Don't use an electrostatics kernel with a modifier if the vdw one does not have one,
+    # unless the electrostatics one is reaction-field with exact cutoff.
+    if(KernelVdw!='None' and KernelVdwMod=='None' and KernelElecMod!='None'):
+        if(KernelElec=='ReactionField' and KernelVdw!='CubicSplineTable'):
+            return 0
+        elif(KernelElec!='ReactionField'):
+            return 0
+
+    return 1
+
+
+
+#
+# The preprocessor will automatically expand the interactions for water and other
+# geometries inside the kernel, but to get this right we need to setup a couple
+# of defines - we do them in a separate routine to keep the main loop clean.
+#
+# While this routine might look a bit complex it is actually quite straightforward,
+# and the best news is that you wont have to modify _anything_ for a new geometry
+# as long as you correctly define its Electrostatics/Vdw geometry in the lists above!
+#
+def SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines):
+    # What is the _name_ for the i/j group geometry?
+    igeometry            = KernelGeom[0]
+    jgeometry            = KernelGeom[1]
+    # define so we can access it in the source when the preprocessor runs
+    defines['GEOMETRY_I'] = igeometry
+    defines['GEOMETRY_J'] = jgeometry
+
+    # For the i/j groups, extract a python list of which sites have electrostatics
+    # For SPC/TIP3p this will be [1,1,1], while TIP4p (no elec on first site) will be [0,1,1,1]
+    ielec                = GeometryElectrostatics[igeometry]
+    jelec                = GeometryElectrostatics[jgeometry]
+    # Zero out the corresponding lists in case we dont do Elec
+    if(KernelElec=='None'):
+        ielec = []
+        jelec = []
+
+    # Extract similar interaction lists for Vdw interactions (example for SPC: [1,0,0])
+    iVdw                 = GeometryVdw[igeometry]
+    jVdw                 = GeometryVdw[jgeometry]
+
+    # Zero out the corresponding lists in case we dont do Vdw
+    if(KernelVdw=='None'):
+        iVdw = []
+        jVdw = []
+
+    # iany[] and jany[] contains lists of the particles actually used (for interactions) in this kernel
+    iany = list(set(ielec+iVdw))  # convert to+from set to make elements unique
+    jany = list(set(jelec+jVdw))
+
+    defines['PARTICLES_ELEC_I'] = ielec
+    defines['PARTICLES_ELEC_J'] = jelec
+    defines['PARTICLES_VDW_I']  = iVdw
+    defines['PARTICLES_VDW_J']  = jVdw
+    defines['PARTICLES_I']      = iany
+    defines['PARTICLES_J']      = jany
+
+    # elecij,Vdwij are sets with pairs of particles for which the corresponding interaction is done
+    # (and anyij again corresponds to either electrostatics or Vdw)
+    elecij = []
+    Vdwij  = []
+    anyij  = []
+
+    for i in ielec:
+        for j in jelec:
+            elecij.append([i,j])
+
+    for i in iVdw:
+        for j in jVdw:
+            Vdwij.append([i,j])
+
+    for i in iany:
+        for j in jany:
+            if [i,j] in elecij or [i,j] in Vdwij:
+                anyij.append([i,j])
+
+    defines['PAIRS_IJ']     = anyij
+
+    # Make an 2d list-of-distance-properties-to-calculate for i,j
+    ni = max(iany)+1
+    nj = max(jany)+1
+    # Each element properties[i][j] is an empty list
+    properties = [ [ [] for j in range(0,nj) ] for i in range (0,ni) ]
+    # Add properties to each set
+    for i in range(0,ni):
+        for j in range(0,nj):
+            if [i,j] in elecij:
+                properties[i][j] = properties[i][j] + ['electrostatics'] + ElectrostaticsList[KernelElec] + ModifierList[KernelElecMod]
+            if [i,j] in Vdwij:
+                properties[i][j] = properties[i][j] + ['vdw'] + VdwList[KernelVdw] + ModifierList[KernelVdwMod]
+            # Add rinv if we need r
+            if 'r' in properties[i][j]:
+                properties[i][j] = properties[i][j] + ['rinv']
+            # Add rsq if we need rinv or rinsq
+            if 'rinv' in properties[i][j] or 'rinvsq' in properties[i][j]:
+                properties[i][j] = properties[i][j] + ['rsq']
+
+    defines['INTERACTION_FLAGS']    = properties
+
+
+
+def PrintStatistics(ratio):
+    ratio = 100.0*ratio
+    print '\rGenerating %s nonbonded kernels... %5.1f%%' % (Arch,ratio),
+    sys.stdout.flush()
+
+
+
+defines = {}
+kerneldecl = []
+
+cnt     = 0.0
+nelec   = len(ElectrostaticsList)
+nVdw    = len(VdwList)
+nmod    = len(ModifierList)
+ngeom   = len(GeometryNameList)
+
+ntot    = nelec*nmod*nVdw*nmod*ngeom
+
+numKernels = 0
+
+fpdecl = open('nb_kernel_' + Arch + '.c','w')
+fpdecl.write( FileHeader )
+fpdecl.write( '#ifndef nb_kernel_' + Arch + '_h\n' )
+fpdecl.write( '#define nb_kernel_' + Arch + '_h\n\n' )
+fpdecl.write( '#include "../nb_kernel.h"\n\n' )
+
+for KernelElec in ElectrostaticsList:
+    defines['KERNEL_ELEC'] = KernelElec
+
+    for KernelElecMod in ModifierList:
+        defines['KERNEL_MOD_ELEC'] = KernelElecMod
+
+        for KernelVdw in VdwList:
+            defines['KERNEL_VDW'] = KernelVdw
+
+            for KernelVdwMod in ModifierList:
+                defines['KERNEL_MOD_VDW'] = KernelVdwMod
+
+                for KernelGeom in GeometryNameList:
+
+                    cnt += 1
+                    KernelFilename = MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom) + '.c'
+                    fpkernel = open(KernelFilename,'w')
+                    defines['INCLUDE_HEADER'] = 1  # Include header first time in new file
+                    DoHeader = 1
+
+                    for KernelVF in VFList:
+
+                        KernelName = MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF)
+
+                        defines['KERNEL_NAME'] = KernelName
+                        defines['KERNEL_VF']   = KernelVF
+
+                        # Check if this is a valid/sane/usable combination
+                        if not KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+                            continue;
+
+                        # The overall kernel settings determine what the _kernel_ calculates, but for the water
+                        # kernels this does not mean that every pairwise interaction has e.g. Vdw interactions.
+                        # This routine sets defines of what to calculate for each pair of particles in those cases.
+                        SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines)
+
+                        if(DoHeader==1):
+                            fpkernel.write( FileHeader )
+
+                        gmxpreprocess('nb_kernel_template_' + Arch + '.pre', KernelName+'.tmp' , defines, force=1,contentType='C')
+                        numKernels = numKernels + 1
+
+                        defines['INCLUDE_HEADER'] = 0   # Header has been included once now
+                        DoHeader=0
+
+                        # Append temp file contents to the common kernelfile
+                        fptmp = open(KernelName+'.tmp','r')
+                        fpkernel.writelines(fptmp.readlines())
+                        fptmp.close()
+                        os.remove(KernelName+'.tmp')
+
+                        # Add a declaration for this kernel
+                        fpdecl.write('nb_kernel_t ' + KernelName + ';\n');
+
+                        # Add declaration to the buffer
+                        KernelOther=''
+                        kerneldecl.append(MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF))
+
+                    filesize = fpkernel.tell()
+                    fpkernel.close()
+                    if(filesize==0):
+                        os.remove(KernelFilename)
+
+                    PrintStatistics(cnt/ntot)
+                pass
+            pass
+        pass
+    pass
+pass
+
+# Write out the list of settings and corresponding kernels to the declaration file
+fpdecl.write( '\n\n' )
+fpdecl.write( 'nb_kernel_info_t\n' )
+fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( '{\n' )
+for decl in kerneldecl[0:-1]:
+    fpdecl.write( decl + ',\n' )
+fpdecl.write( kerneldecl[-1] + '\n' )
+fpdecl.write( '};\n\n' )
+fpdecl.write( 'int\n' )
+fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( '#endif\n')
+fpdecl.close()
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..4ecf62d
--- /dev/null
@@ -0,0 +1,802 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 73 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 74 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*74);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 61 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 62 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*62);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..f2206de
--- /dev/null
@@ -0,0 +1,1254 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 159 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 162 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*162);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 139 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 142 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*142);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..6ea9d6c
--- /dev/null
@@ -0,0 +1,2432 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq01,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq02,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 417 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq01,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq02,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 426 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*426);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 382 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*382);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..3ed4b97
--- /dev/null
@@ -0,0 +1,1422 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq30,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 185 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq30,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 189 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*189);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 165 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*169);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..b643300
--- /dev/null
@@ -0,0 +1,2616 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq13,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq23,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq31,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq32,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq33,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 446 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq13,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq23,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq31,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq32,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq33,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 456 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*456);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 402 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 412 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*412);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..f06c4c3
--- /dev/null
@@ -0,0 +1,726 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 56 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*57);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 47 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 48 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*48);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..e78f14b
--- /dev/null
@@ -0,0 +1,1178 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 142 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 145 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*145);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 125 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 128 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*128);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..dc10947
--- /dev/null
@@ -0,0 +1,2356 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq01,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq02,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 400 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq01,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq02,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 409 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*409);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 359 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 368 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*368);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..b07cf7b
--- /dev/null
@@ -0,0 +1,1308 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq30,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 161 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq30,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 164 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*164);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 144 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 147 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*147);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..945c289
--- /dev/null
@@ -0,0 +1,2502 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq13,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq23,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq31,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq32,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq33,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 422 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq13,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq23,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq31,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq32,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq33,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 431 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*431);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 381 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 390 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*390);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..d152ac2
--- /dev/null
@@ -0,0 +1,631 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 43 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 44 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 11 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*44);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 39 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 40 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*40);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..eb8ea26
--- /dev/null
@@ -0,0 +1,1083 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 132 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*132);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 117 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*120);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..63788ec
--- /dev/null
@@ -0,0 +1,2291 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq01,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq02,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 387 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq01,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq02,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*396);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r01,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r02,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 360 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*360);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..79f91ec
--- /dev/null
@@ -0,0 +1,1083 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq30,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq10,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq20,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq30,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 132 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*132);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 117 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r10,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r20,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r30,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq30,FF),_mm_mul_ps(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*120);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..c72a4a5
--- /dev/null
@@ -0,0 +1,2291 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq13,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq23,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq31,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq32,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq33,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 387 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq11,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq12,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq13,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq21,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq22,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq23,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq31,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq32,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq33,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*396);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r11,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r12,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r13,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r21,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r22,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r23,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r31,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r32,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r33,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,2);
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 360 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..12c16ba
--- /dev/null
@@ -0,0 +1,774 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 63 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 64 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*64);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 54 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 55 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*55);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..776bd4b
--- /dev/null
@@ -0,0 +1,1102 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 119 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*120);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 109 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*109);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..8440c24
--- /dev/null
@@ -0,0 +1,1908 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 287 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 288 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*288);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 271 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..1aa4960
--- /dev/null
@@ -0,0 +1,1236 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 140 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 130 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*130);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..68ea5f2
--- /dev/null
@@ -0,0 +1,2058 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 311 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 312 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*312);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 294 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 295 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..d26e65e
--- /dev/null
@@ -0,0 +1,646 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 40 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 40 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*40);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 34 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 34 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*34);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..df2a812
--- /dev/null
@@ -0,0 +1,974 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 88 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 88 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*88);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..ecda969
--- /dev/null
@@ -0,0 +1,1780 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 264 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 264 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*264);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 250 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 250 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*250);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..0f9f0dc
--- /dev/null
@@ -0,0 +1,1108 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 116 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 116 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*116);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*108);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..237af11
--- /dev/null
@@ -0,0 +1,1930 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 287 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 287 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*287);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 273 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 273 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*273);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..99b3fa3
--- /dev/null
@@ -0,0 +1,559 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 28 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 28 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 11 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*28);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 27 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 27 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*27);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..bf824ef
--- /dev/null
@@ -0,0 +1,887 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 84 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 84 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*84);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 81 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 81 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*81);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..040065d
--- /dev/null
@@ -0,0 +1,1723 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 252 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 252 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*252);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 243 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(velec,rinvsq00);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,rinv01);
+            felec            = _mm_mul_ps(velec,rinvsq01);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,rinv02);
+            felec            = _mm_mul_ps(velec,rinvsq02);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 243 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*243);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..ffb4b05
--- /dev/null
@@ -0,0 +1,887 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 84 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 84 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*84);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 81 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,rinv10);
+            felec            = _mm_mul_ps(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,rinv20);
+            felec            = _mm_mul_ps(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,rinv30);
+            felec            = _mm_mul_ps(velec,rinvsq30);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 81 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*81);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..a1f394c
--- /dev/null
@@ -0,0 +1,1723 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 252 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 252 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*252);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 243 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,rinv11);
+            felec            = _mm_mul_ps(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,rinv12);
+            felec            = _mm_mul_ps(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,rinv13);
+            felec            = _mm_mul_ps(velec,rinvsq13);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,rinv21);
+            felec            = _mm_mul_ps(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,rinv22);
+            felec            = _mm_mul_ps(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,rinv23);
+            felec            = _mm_mul_ps(velec,rinvsq23);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,rinv31);
+            felec            = _mm_mul_ps(velec,rinvsq31);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,rinv32);
+            felec            = _mm_mul_ps(velec,rinvsq32);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,rinv33);
+            felec            = _mm_mul_ps(velec,rinvsq33);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 243 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*243);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..9dfa80f
--- /dev/null
@@ -0,0 +1,772 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 64 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 65 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*65);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 47 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*47);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..15afb7d
--- /dev/null
@@ -0,0 +1,1280 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 156 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 159 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*159);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 124 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 127 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*127);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..be31d87
--- /dev/null
@@ -0,0 +1,2626 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 432 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 441 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*441);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 358 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 367 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*367);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..b6f28fa
--- /dev/null
@@ -0,0 +1,1450 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 179 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 182 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*182);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 147 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 150 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*150);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..eaef498
--- /dev/null
@@ -0,0 +1,2812 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 458 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 467 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*467);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 384 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 393 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*393);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..ce22870
--- /dev/null
@@ -0,0 +1,675 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 47 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 11 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*47);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 39 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 40 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*40);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..c295b34
--- /dev/null
@@ -0,0 +1,1183 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 138 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 141 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 117 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*120);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..a41a7f8
--- /dev/null
@@ -0,0 +1,2559 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 423 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*423);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 360 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*360);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..a6b2a2f
--- /dev/null
@@ -0,0 +1,1183 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 138 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 141 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 117 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*120);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..dd74d08
--- /dev/null
@@ -0,0 +1,2559 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 423 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*423);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 360 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..b3a6e65
--- /dev/null
@@ -0,0 +1,856 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 83 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 84 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*84);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 77 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 78 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*78);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..5db012f
--- /dev/null
@@ -0,0 +1,1468 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 213 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 216 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*216);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 201 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 204 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*204);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..fdd7523
--- /dev/null
@@ -0,0 +1,3126 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            d                = _mm_sub_ps(r01,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            d                = _mm_sub_ps(r02,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 603 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            d                = _mm_sub_ps(r01,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            d                = _mm_sub_ps(r02,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 612 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*612);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            d                = _mm_sub_ps(r01,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            d                = _mm_sub_ps(r02,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 573 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            d                = _mm_sub_ps(r01,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            d                = _mm_sub_ps(r02,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 582 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*582);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..7dfac8d
--- /dev/null
@@ -0,0 +1,1688 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            d                = _mm_sub_ps(r30,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 254 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            d                = _mm_sub_ps(r30,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 258 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*258);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            d                = _mm_sub_ps(r30,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 242 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            d                = _mm_sub_ps(r30,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 246 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*246);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..51bf341
--- /dev/null
@@ -0,0 +1,3362 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            d                = _mm_sub_ps(r13,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            d                = _mm_sub_ps(r23,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            d                = _mm_sub_ps(r31,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            d                = _mm_sub_ps(r32,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            d                = _mm_sub_ps(r33,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 647 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            d                = _mm_sub_ps(r13,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            d                = _mm_sub_ps(r23,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            d                = _mm_sub_ps(r31,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            d                = _mm_sub_ps(r32,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            d                = _mm_sub_ps(r33,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 657 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*657);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            d                = _mm_sub_ps(r13,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            d                = _mm_sub_ps(r23,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            d                = _mm_sub_ps(r31,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            d                = _mm_sub_ps(r32,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            d                = _mm_sub_ps(r33,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 617 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            d                = _mm_sub_ps(r13,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            d                = _mm_sub_ps(r23,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            d                = _mm_sub_ps(r31,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            d                = _mm_sub_ps(r32,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            d                = _mm_sub_ps(r33,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 627 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*627);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..246a6e3
--- /dev/null
@@ -0,0 +1,755 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 65 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 66 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 11 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*66);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*63);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..7929909
--- /dev/null
@@ -0,0 +1,1367 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 195 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 198 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*198);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 186 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 189 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*189);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..da9e66a
--- /dev/null
@@ -0,0 +1,3055 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            d                = _mm_sub_ps(r01,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            d                = _mm_sub_ps(r02,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            d                = _mm_sub_ps(r01,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            d                = _mm_sub_ps(r02,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 594 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*594);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            d                = _mm_sub_ps(r01,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            d                = _mm_sub_ps(r02,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 558 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            d                = _mm_sub_ps(r01,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            d                = _mm_sub_ps(r02,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 567 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*567);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..539423e
--- /dev/null
@@ -0,0 +1,1367 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            d                = _mm_sub_ps(r30,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 195 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            d                = _mm_sub_ps(r30,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 198 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*198);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            d                = _mm_sub_ps(r30,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 186 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            d                = _mm_sub_ps(r10,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            d                = _mm_sub_ps(r20,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            d                = _mm_sub_ps(r30,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 189 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*189);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..1dda59c
--- /dev/null
@@ -0,0 +1,3055 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            d                = _mm_sub_ps(r13,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            d                = _mm_sub_ps(r23,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            d                = _mm_sub_ps(r31,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            d                = _mm_sub_ps(r32,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            d                = _mm_sub_ps(r33,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            d                = _mm_sub_ps(r13,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            d                = _mm_sub_ps(r23,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            d                = _mm_sub_ps(r31,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            d                = _mm_sub_ps(r32,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            d                = _mm_sub_ps(r33,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+            velec            = _mm_mul_ps(velec,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 594 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*594);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            d                = _mm_sub_ps(r13,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            d                = _mm_sub_ps(r23,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            d                = _mm_sub_ps(r31,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            d                = _mm_sub_ps(r32,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            d                = _mm_sub_ps(r33,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 558 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            d                = _mm_sub_ps(r11,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            d                = _mm_sub_ps(r12,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            d                = _mm_sub_ps(r13,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            d                = _mm_sub_ps(r21,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            d                = _mm_sub_ps(r22,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            d                = _mm_sub_ps(r23,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            d                = _mm_sub_ps(r31,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            d                = _mm_sub_ps(r32,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            d                = _mm_sub_ps(r33,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 567 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*567);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..c078658
--- /dev/null
@@ -0,0 +1,832 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 75 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 76 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*76);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*63);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..7e652f0
--- /dev/null
@@ -0,0 +1,1264 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 157 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 160 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*160);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 134 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 137 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*137);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..c88daa1
--- /dev/null
@@ -0,0 +1,2382 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 403 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 412 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*412);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 350 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 359 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*359);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..2e0ce98
--- /dev/null
@@ -0,0 +1,1408 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 179 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 183 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*183);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 156 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 160 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*160);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..b42e52d
--- /dev/null
@@ -0,0 +1,2542 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 428 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 438 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*438);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 375 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 385 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*385);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..bddd767
--- /dev/null
@@ -0,0 +1,714 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 53 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 54 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*54);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 43 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 44 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*44);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..f0d6ed0
--- /dev/null
@@ -0,0 +1,1146 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 138 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*138);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 115 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 118 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*118);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..3e4a488
--- /dev/null
@@ -0,0 +1,2264 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 381 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 390 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*390);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 331 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 340 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*340);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..8746ecb
--- /dev/null
@@ -0,0 +1,1280 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 155 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 158 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*158);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 138 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*138);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..1faa292
--- /dev/null
@@ -0,0 +1,2414 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 404 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 413 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*413);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 354 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 363 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*363);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..0b4dfe6
--- /dev/null
@@ -0,0 +1,627 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 41 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 42 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 11 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*42);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 36 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 37 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*37);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..ccc37cb
--- /dev/null
@@ -0,0 +1,1059 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 126 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*126);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 111 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*111);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..3cc1f50
--- /dev/null
@@ -0,0 +1,2207 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 369 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*378);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 333 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*333);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..fb1468d
--- /dev/null
@@ -0,0 +1,1059 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 126 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*126);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 111 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*111);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..c2230f2
--- /dev/null
@@ -0,0 +1,2207 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 369 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*378);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 333 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*333);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..ebae1c3
--- /dev/null
@@ -0,0 +1,925 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          gbitab;
+    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+    __m128           minushalf = _mm_set1_ps(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        isai0            = _mm_load1_ps(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vgbsum           = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+        dvdasum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vgbsum           = _mm_add_ps(vgbsum,vgb);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 92 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vgb              = _mm_andnot_ps(dummy_mask,vgb);
+            vgbsum           = _mm_add_ps(vgbsum,vgb);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 13 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*13 + inneriter*93);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          gbitab;
+    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+    __m128           minushalf = _mm_set1_ps(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        isai0            = _mm_load1_ps(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        dvdasum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 82 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 83 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*83);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..fab1a85
--- /dev/null
@@ -0,0 +1,823 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          gbitab;
+    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+    __m128           minushalf = _mm_set1_ps(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        isai0            = _mm_load1_ps(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vgbsum           = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+        dvdasum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vgbsum           = _mm_add_ps(vgbsum,vgb);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 71 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vgb              = _mm_andnot_ps(dummy_mask,vgb);
+            vgbsum           = _mm_add_ps(vgbsum,vgb);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 72 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 13 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*13 + inneriter*72);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          gbitab;
+    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+    __m128           minushalf = _mm_set1_ps(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        isai0            = _mm_load1_ps(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        dvdasum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 64 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 65 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*65);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..fdbf8e1
--- /dev/null
@@ -0,0 +1,728 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          gbitab;
+    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+    __m128           minushalf = _mm_set1_ps(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        isai0            = _mm_load1_ps(invsqrta+inr+0);
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vgbsum           = _mm_setzero_ps();
+        dvdasum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vgbsum           = _mm_add_ps(vgbsum,vgb);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 58 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vgb              = _mm_andnot_ps(dummy_mask,vgb);
+            vgbsum           = _mm_add_ps(vgbsum,vgb);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
+        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*12 + inneriter*59);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128i          gbitab;
+    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+    __m128           minushalf = _mm_set1_ps(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        isai0            = _mm_load1_ps(invsqrta+inr+0);
+
+        dvdasum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 56 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            isaj0            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
+                                                              invsqrta+jnrC+0,invsqrta+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai0,isaj0);
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r00,gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+            velec            = _mm_mul_ps(qq00,rinv00);
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
+        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*57);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..4332009
--- /dev/null
@@ -0,0 +1,723 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 56 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*57);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 48 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 49 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*49);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..143a6b0
--- /dev/null
@@ -0,0 +1,649 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 41 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 41 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*41);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 30 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 30 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*30);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..1734983
--- /dev/null
@@ -0,0 +1,735 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 60 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*60);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 56 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*57);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..1462dd3
--- /dev/null
@@ -0,0 +1,595 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 32 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 32 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*32);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 27 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 27 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*27);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..acb5504
--- /dev/null
@@ -0,0 +1,828 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 72 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 73 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*73);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 57 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 58 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*58);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..7c57e51
--- /dev/null
@@ -0,0 +1,1228 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 144 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 145 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*145);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 117 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 118 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*118);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..167bebb
--- /dev/null
@@ -0,0 +1,2250 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 360 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 361 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*361);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 298 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*298);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..f9784b6
--- /dev/null
@@ -0,0 +1,1360 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 164 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 165 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*165);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 138 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 139 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*139);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..614a2ab
--- /dev/null
@@ -0,0 +1,2398 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 383 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 384 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*384);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 321 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 322 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*322);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..a1c11cb
--- /dev/null
@@ -0,0 +1,708 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 54 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 54 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*54);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 37 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 37 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*37);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..9850989
--- /dev/null
@@ -0,0 +1,1108 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 126 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 126 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*126);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 97 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 97 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*97);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..9a5a0b7
--- /dev/null
@@ -0,0 +1,2130 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 342 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 342 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*342);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*277);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..d1e495e
--- /dev/null
@@ -0,0 +1,1278 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 149 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 149 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*149);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*120);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..d7edf86
--- /dev/null
@@ -0,0 +1,2316 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 368 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 368 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*368);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*303);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..8ff330d
--- /dev/null
@@ -0,0 +1,786 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 70 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 71 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*71);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 61 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*62);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..7b9b449
--- /dev/null
@@ -0,0 +1,1186 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 142 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 143 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*143);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 121 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 122 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*122);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..03db8e3
--- /dev/null
@@ -0,0 +1,2208 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 358 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 359 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*359);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 301 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 302 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*302);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..30c5e84
--- /dev/null
@@ -0,0 +1,1360 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 167 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 168 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*168);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 146 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 147 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*147);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..0adb77b
--- /dev/null
@@ -0,0 +1,2398 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 386 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 387 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*387);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 329 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            d                = _mm_sub_ps(r00,rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 330 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*330);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..b31d4e7
--- /dev/null
@@ -0,0 +1,611 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 36 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 36 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 11 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*36);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 30 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 30 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*30);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..60fe430
--- /dev/null
@@ -0,0 +1,1011 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 90 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 90 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*90);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..82d9f89
--- /dev/null
@@ -0,0 +1,2063 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*324);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*270);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..ab47dc6
--- /dev/null
@@ -0,0 +1,1011 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 90 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 90 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*90);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..5030b0b
--- /dev/null
@@ -0,0 +1,2063 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*324);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*270);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..c3c548a
--- /dev/null
@@ -0,0 +1,778 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 67 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 68 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*68);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 54 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 55 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*55);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..fb01e12
--- /dev/null
@@ -0,0 +1,1102 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 132 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*132);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 109 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*109);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..53e1d86
--- /dev/null
@@ -0,0 +1,1896 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 323 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*324);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 271 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..545a0d1
--- /dev/null
@@ -0,0 +1,1236 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 152 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 153 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*153);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 130 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*130);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..ffd647a
--- /dev/null
@@ -0,0 +1,2046 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 347 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_00,VV);
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 348 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*348);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 294 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r00,vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            vfitab           = _mm_slli_epi32(vfitab,3);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_00,FF);
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 295 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..fb3a721
--- /dev/null
@@ -0,0 +1,650 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 44 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 44 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*44);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 34 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 34 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*34);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..b12cf69
--- /dev/null
@@ -0,0 +1,974 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 88 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 88 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*88);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..4732af9
--- /dev/null
@@ -0,0 +1,1768 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 300 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 300 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 29 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*300);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 250 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 250 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*250);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..0fda25e
--- /dev/null
@@ -0,0 +1,1108 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 128 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 128 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*128);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*108);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..501c5ed
--- /dev/null
@@ -0,0 +1,1918 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 323 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _mm_mul_ps(c6_00,rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 323 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 38 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*323);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 273 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = gmx_mm_inv_ps(rsq00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 273 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 36 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*273);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomP1P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..ec00cfc
--- /dev/null
@@ -0,0 +1,563 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 32 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 32 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 11 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*32);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 27 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 27 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*27);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW3P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..e413519
--- /dev/null
@@ -0,0 +1,887 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 81 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 81 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*81);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW3W3_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..3c1d2d8
--- /dev/null
@@ -0,0 +1,1711 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 288 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 288 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*288);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    qq00             = _mm_mul_ps(iq0,jq0);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
+        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
+        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 243 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 243 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*243);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW4P1_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..6aaa31d
--- /dev/null
@@ -0,0 +1,887 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 81 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+
+            /* Inner loop uses 81 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*81);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW4W4_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..f62be8b
--- /dev/null
@@ -0,0 +1,1711 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 288 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 288 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 28 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*288);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
+        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
+        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
+        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
+        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
+        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
+        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
+        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 243 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 243 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 27 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*243);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_sse2_single.c b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_sse2_single.c
new file mode 100644 (file)
index 0000000..ddaa369
--- /dev/null
@@ -0,0 +1,465 @@
+/*
+ * Note: this file was generated by the Gromacs sse2_single kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifndef nb_kernel_sse2_single_h
+#define nb_kernel_sse2_single_h
+
+#include "../nb_kernel.h"
+
+nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single;
+
+
+nb_kernel_info_t
+kernellist_sse2_single[] =
+{
+    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
+};
+
+int
+kernellist_sse2_single_size = sizeof(kernellist_sse2_single)/sizeof(kernellist_sse2_single[0]);
+
+#endif
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_sse2_single.h b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_sse2_single.h
new file mode 100644 (file)
index 0000000..dab81ab
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Note: this file was generated by the Gromacs c kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifndef nb_kernel_sse2_single_h
+#define nb_kernel_sse2_single_h
+
+#include "../nb_kernel.h"
+
+
+/* List of kernels for this architecture with metadata about them */
+extern nb_kernel_info_t
+kernellist_sse2_single[];
+
+/* Length of kernellist_c */
+extern int
+kernellist_sse2_single_size;
+
+#endif
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre b/src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre
new file mode 100644 (file)
index 0000000..7173bdd
--- /dev/null
@@ -0,0 +1,982 @@
+/* #if 0 */
+#error This file must be processed with the Gromacs pre-preprocessor
+/* #endif */
+/* #if INCLUDE_HEADER */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gmx_math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+/* #endif */
+
+/* ## List of variables set by the generating script:                                    */
+/* ##                                                                                    */
+/* ## Setttings that apply to the entire kernel:                                         */
+/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
+/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
+/* ## KERNEL_NAME:           String, name of this kernel                                 */
+/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
+/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
+/* ##                                                                                    */
+/* ## Setttings that apply to particles in the outer (I) or inner (J) loops:             */
+/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
+/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
+/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
+/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
+/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
+/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
+/* ##                                                                                    */
+/* ## Setttings for pairs of interactions (e.g. 2nd i particle against 1st j particle)   */
+/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
+/* ##                        should be calculated in this kernel. Zero-charge particles  */
+/* ##                        do not have interactions with particles without vdw, and    */
+/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
+/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
+/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
+/* ##                        defining properties/flags of this interaction. Examples     */
+/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
+/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
+/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
+/* ##                        decide if the force/potential should be modified. This way  */
+/* ##                        we only calculate values absolutely needed for each case.   */
+
+/* ## Calculate the size and offset for (merged/interleaved) table data */
+
+/*
+ * Gromacs nonbonded kernel:   {KERNEL_NAME}
+ * Electrostatics interaction: {KERNEL_ELEC}
+ * VdW interaction:            {KERNEL_VDW}
+ * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
+ * Calculate force/pot:        {KERNEL_VF}
+ */
+void
+{KERNEL_NAME}
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
+    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             shX,shY,shZ,rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    /* #for I in PARTICLES_I */
+    int              vdwioffset{I};
+    __m128           ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
+    /* #endfor */
+    /* #for J in PARTICLES_J */
+    int              vdwjidx{J}A,vdwjidx{J}B,vdwjidx{J}C,vdwjidx{J}D;
+    __m128           jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
+    /* #endfor */
+    /* #for I,J in PAIRS_IJ */
+    __m128           dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
+    /* #endfor */
+    /* #if KERNEL_ELEC != 'None' */
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    /* #endif */
+    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
+    __m128i          gbitab;
+    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+    __m128           minushalf = _mm_set1_ps(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    /* #endif */
+    /* #if KERNEL_VDW != 'None' */
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    /* #endif */
+    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
+    __m128i          vfitab;
+    __m128i          ifour       = _mm_set1_epi32(4);
+    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
+    real             *vftab;
+    /* #endif */
+    /* #if 'Ewald' in KERNEL_ELEC */
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    /* #endif */
+    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    /* #endif */
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    /* #if KERNEL_ELEC != 'None' */
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    /*     #if 'ReactionField' in KERNEL_ELEC */
+    krf              = _mm_set1_ps(fr->ic->k_rf);
+    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
+    crf              = _mm_set1_ps(fr->ic->c_rf);
+    /*     #endif */
+    /* #endif */
+    /* #if KERNEL_VDW != 'None' */
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    /* #endif */
+
+    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
+    /* #elif 'Table' in KERNEL_ELEC */
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
+    /* #elif 'Table' in KERNEL_VDW */
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
+    /* #endif */
+
+    /* #if 'Ewald' in KERNEL_ELEC */
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+    /*     #else */
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+     /*     #endif */
+    /* #endif */
+
+    /* #if KERNEL_ELEC=='GeneralizedBorn' */
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = _mm_set1_ps(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+    /* #endif */
+
+    /* #if 'Water' in GEOMETRY_I */
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    /*     #for I in PARTICLES_ELEC_I */
+    iq{I}              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+{I}]));
+    /*     #endfor */
+    /*     #for I in PARTICLES_VDW_I */
+    vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
+    /*     #endfor */
+    /* #endif */
+
+    /* #if 'Water' in GEOMETRY_J */
+    /*     #for J in PARTICLES_ELEC_J */
+    jq{J}              = _mm_set1_ps(charge[inr+{J}]);
+    /*     #endfor */
+    /*     #for J in PARTICLES_VDW_J */
+    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
+    /*     #endfor */
+    /*     #for I,J in PAIRS_IJ */
+    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+    qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
+    /*         #endif */
+    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*         #endif */
+    /*     #endfor */
+    /* #endif */
+
+    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
+    /*     #if KERNEL_ELEC!='None' */
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    /*     #else */
+    rcutoff_scalar   = fr->rvdw;
+    /*     #endif */
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+    /* #endif */
+
+    /* #if KERNEL_MOD_VDW=='PotentialShift' */
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+    /* #endif */
+
+    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /*     #else */
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = _mm_set1_ps(rswitch_scalar);
+    /*     #endif */
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = _mm_set1_ps(d_scalar);
+    swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    /*     #if 'Force' in KERNEL_VF */
+    swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    /*     #endif */
+    /* #endif */
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    /* ## Keep track of the floating point operations we issue for reporting! */
+    /* #define OUTERFLOPS 0 */
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        /* ## Loop over i particles, but only include ones that we use - skip e.g. vdw-only sites for elec-only kernel */
+        /*     #for I in PARTICLES_I */
+        ix{I}              = _mm_set1_ps(shX + x[i_coord_offset+DIM*{I}+XX]);
+        iy{I}              = _mm_set1_ps(shY + x[i_coord_offset+DIM*{I}+YY]);
+        iz{I}              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*{I}+ZZ]);
+        /*     #define OUTERFLOPS OUTERFLOPS+3 */
+        /* #endfor */
+
+        /* #if 'Force' in KERNEL_VF */
+        /*     #for I in PARTICLES_I */
+        fix{I}             = _mm_setzero_ps();
+        fiy{I}             = _mm_setzero_ps();
+        fiz{I}             = _mm_setzero_ps();
+        /*     #endfor */
+        /* #endif */
+
+        /* ## For water we already preloaded parameters at the start of the kernel */
+        /* #if not 'Water' in GEOMETRY_I */
+        /* Load parameters for i particles */
+        /*     #for I in PARTICLES_ELEC_I */
+        iq{I}              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+{I}));
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
+        isai{I}            = _mm_load1_ps(invsqrta+inr+{I});
+        /*         #endif */
+        /*     #endfor */
+        /*     #for I in PARTICLES_VDW_I */
+        vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
+        /*     #endfor */
+        /* #endif */
+
+        /* #if 'Potential' in KERNEL_VF */
+        /* Reset potential sums */
+        /*     #if KERNEL_ELEC != 'None' */
+        velecsum         = _mm_setzero_ps();
+        /*     #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
+        vgbsum           = _mm_setzero_ps();
+        /*     #endif */
+        /*     #if KERNEL_VDW != 'None' */
+        vvdwsum          = _mm_setzero_ps();
+        /*     #endif */
+        /* #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+        dvdasum          = _mm_setzero_ps();
+        /*     #endif */
+
+        /* #for ROUND in ['Loop','Epilogue'] */
+
+        /* #if ROUND =='Loop' */
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+        /* ## First round is normal loop (next statement resets indentation) */
+        /*     #if 0 */
+        }
+        /*     #endif */
+        /* #else */
+        if(jidx<j_index_end)
+        {
+        /* ## Second round is epilogue */
+        /* #endif */
+        /* #define INNERFLOPS 0 */
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+
+            /* #if ROUND =='Epilogue' */
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrA>=0) ? jnrA : 0;
+            jnrB       = (jnrB>=0) ? jnrB : 0;
+            jnrC       = (jnrC>=0) ? jnrC : 0;
+            jnrD       = (jnrD>=0) ? jnrD : 0;
+
+            /* #endif */
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            /* #if GEOMETRY_J == 'Particle'             */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+            /* #elif GEOMETRY_J == 'Water3'             */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+            /* #elif GEOMETRY_J == 'Water4'             */
+            /*     #if 0 in PARTICLES_J                 */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+            /*     #else                                */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+            /*     #endif                               */
+            /* #endif                                   */
+
+            /* Calculate displacement vector */
+            /* #for I,J in PAIRS_IJ */
+            dx{I}{J}             = _mm_sub_ps(ix{I},jx{J});
+            dy{I}{J}             = _mm_sub_ps(iy{I},jy{J});
+            dz{I}{J}             = _mm_sub_ps(iz{I},jz{J});
+            /*     #define INNERFLOPS INNERFLOPS+3 */
+            /* #endfor */
+
+            /* Calculate squared distance and things based on it */
+            /* #for I,J in PAIRS_IJ */
+            rsq{I}{J}            = gmx_mm_calc_rsq_ps(dx{I}{J},dy{I}{J},dz{I}{J});
+            /*     #define INNERFLOPS INNERFLOPS+5 */
+            /* #endfor */
+
+            /* #for I,J in PAIRS_IJ */
+            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
+            rinv{I}{J}           = gmx_mm_invsqrt_ps(rsq{I}{J});
+            /*         #define INNERFLOPS INNERFLOPS+5 */
+            /*     #endif */
+            /* #endfor */
+
+            /* #for I,J in PAIRS_IJ */
+            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
+            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
+            rinvsq{I}{J}         = gmx_mm_inv_ps(rsq{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*         #else */
+            rinvsq{I}{J}         = _mm_mul_ps(rinv{I}{J},rinv{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*     #endif */
+            /* #endfor */
+
+            /* #if not 'Water' in GEOMETRY_J */
+            /* Load parameters for j particles */
+            /*     #for J in PARTICLES_ELEC_J */
+            jq{J}              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+{J},charge+jnrB+{J},
+                                                              charge+jnrC+{J},charge+jnrD+{J});
+            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
+            isaj{J}            = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+{J},invsqrta+jnrB+{J},
+                                                              invsqrta+jnrC+{J},invsqrta+jnrD+{J});
+            /*         #endif */
+            /*     #endfor */
+            /*     #for J in PARTICLES_VDW_J */
+            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
+            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
+            vdwjidx{J}C        = 2*vdwtype[jnrC+{J}];
+            vdwjidx{J}D        = 2*vdwtype[jnrD+{J}];
+            /*     #endfor */
+            /* #endif */
+
+            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_J */
+            /*     #for J in PARTICLES_J */
+            fjx{J}             = _mm_setzero_ps();
+            fjy{J}             = _mm_setzero_ps();
+            fjz{J}             = _mm_setzero_ps();
+            /*     #endfor */
+            /* #endif */
+
+            /* #for I,J in PAIRS_IJ */
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*         ## We always calculate rinv/rinvsq above to enable pipelineing in compilers (performance tested on x86) */
+            if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
+            {
+                /*     #if 0    ## this and the next two lines is a hack to maintain auto-indentation in template file */
+            }
+            /*         #endif */
+            /*         #define INNERFLOPS INNERFLOPS+1 */
+            /*     #endif */
+
+            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
+            r{I}{J}              = _mm_mul_ps(rsq{I}{J},rinv{I}{J});
+            /*         #if ROUND == 'Epilogue' */
+            r{I}{J}              = _mm_andnot_ps(dummy_mask,r{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*         #define INNERFLOPS INNERFLOPS+1 */
+            /*     #endif */
+
+            /*     ## For water geometries we already loaded parameters at the start of the kernel */
+            /*     #if not 'Water' in GEOMETRY_J */
+            /* Compute parameters for interactions between i and j atoms */
+            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+            qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset{I}+vdwjidx{J}A,
+                                         vdwparam+vdwioffset{I}+vdwjidx{J}B,
+                                         vdwparam+vdwioffset{I}+vdwjidx{J}C,
+                                         vdwparam+vdwioffset{I}+vdwjidx{J}D,
+                                         &c6_{I}{J},&c12_{I}{J});
+            /*         #endif */
+            /*     #endif */
+
+            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _mm_mul_ps(r{I}{J},vftabscale);
+            vfitab           = _mm_cvttps_epi32(rt);
+            vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
+            /*         #define INNERFLOPS INNERFLOPS+4                          */
+            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
+            /*             ## 3 tables, 4 bytes per point: multiply index by 12 */
+            vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
+            /*         #elif 'Table' in KERNEL_ELEC                             */
+            /*             ## 1 table, 4 bytes per point: multiply index by 4   */
+            vfitab           = _mm_slli_epi32(vfitab,2);
+            /*         #elif 'Table' in KERNEL_VDW                              */
+            /*             ## 2 tables, 4 bytes per point: multiply index by 8  */
+            vfitab           = _mm_slli_epi32(vfitab,3);
+            /*         #endif                                                   */
+            /*     #endif */
+
+            /*     ## ELECTROSTATIC INTERACTIONS */
+            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+
+            /*         #if KERNEL_ELEC=='Coulomb' */
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*             #if 'Force' in KERNEL_VF */
+            felec            = _mm_mul_ps(velec,rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+2 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='ReactionField' */
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            /*             #if 'Potential' in KERNEL_VF */
+            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_add_ps(rinv{I}{J},_mm_mul_ps(krf,rsq{I}{J})),crf));
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            felec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_mul_ps(rinv{I}{J},rinvsq{I}{J}),krf2));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _mm_mul_ps(isai{I},isaj{J});
+            gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq{I}{J},_mm_mul_ps(isaprod,gbinvepsdiff)));
+            gbscale          = _mm_mul_ps(isaprod,gbtabscale);
+            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+{J},dvda+jnrB+{J},dvda+jnrC+{J},dvda+jnrD+{J});
+            /*             #define INNERFLOPS INNERFLOPS+5 */
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _mm_mul_ps(r{I}{J},gbscale);
+            gbitab           = _mm_cvttps_epi32(rt);
+            gbeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
+            gbitab           = _mm_slli_epi32(gbitab,2);
+
+            Y                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
+            F                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
+            G                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
+            H                = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(gbeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
+            VV               = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
+            vgb              = _mm_mul_ps(gbqqfactor,VV);
+            /*             #define INNERFLOPS INNERFLOPS+10 */
+
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
+            dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
+            dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
+            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
+                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
+            /*                 #define INNERFLOPS INNERFLOPS+13 */
+            /*             #endif */
+            velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #if 'Force' in KERNEL_VF */
+            felec            = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv{I}{J}),fgb),rinv{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='Ewald' */
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r{I}{J},ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            /*                 #define INNERFLOPS INNERFLOPS+2 */
+            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(_mm_sub_ps(rinv{I}{J},sh_ewald),velec));
+            /*                     #define INNERFLOPS INNERFLOPS+7 */
+            /*                 #else */
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq{I}{J},_mm_sub_ps(rinv{I}{J},velec));
+            /*                     #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #endif */
+            /*                 #if 'Force' in KERNEL_VF */
+            felec            = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
+            /*                      #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #endif */
+            /*             #elif KERNEL_VF=='Force' */
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq{I}{J},rinv{I}{J}),_mm_sub_ps(rinvsq{I}{J},felec));
+            /*                 #define INNERFLOPS INNERFLOPS+7 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF */
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            velec            = _mm_mul_ps(qq{I}{J},VV);
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq{I}{J},FF),_mm_mul_ps(vftabscale,rinv{I}{J})));
+            /*                 #define INNERFLOPS INNERFLOPS+7 */
+            /*             #endif */
+            /*         #endif */
+            /*         ## End of check for electrostatics interaction forms */
+            /*     #endif */
+            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
+
+            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
+
+            /*         #if KERNEL_VDW=='LennardJones' */
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+2 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            vvdw6            = _mm_mul_ps(c6_{I}{J},rinvsix);
+            vvdw12           = _mm_mul_ps(c12_{I}{J},_mm_mul_ps(rinvsix,rinvsix));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_{I}{J},_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_{I}{J},sh_vdw_invrcut6)),one_sixth));
+            /*                     #define INNERFLOPS INNERFLOPS+8 */
+            /*                 #else */
+            vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
+            /*                     #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #endif */
+            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                 #if 'Force' in KERNEL_VF */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq{I}{J});
+            /*                     #define INNERFLOPS INNERFLOPS+2 */
+            /*                 #endif */
+            /*             #elif KERNEL_VF=='Force' */
+            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_{I}{J},rinvsix),c6_{I}{J}),_mm_mul_ps(rinvsix,rinvsq{I}{J}));
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+
+            /*         #elif KERNEL_VDW=='CubicSplineTable' */
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            /*             #if 'Table' in KERNEL_ELEC */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            /*             #endif                     */
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF */
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw6            = _mm_mul_ps(c6_{I}{J},VV);
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw6            = _mm_mul_ps(c6_{I}{J},FF);
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            vfitab           = _mm_add_epi32(vfitab,ifour);
+            Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
+            F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
+            G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
+            H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            Heps             = _mm_mul_ps(vfeps,H);
+            Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF */
+            VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
+            vvdw12           = _mm_mul_ps(c12_{I}{J},VV);
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
+            fvdw12           = _mm_mul_ps(c12_{I}{J},FF);
+            /*                 #define INNERFLOPS INNERFLOPS+5 */
+            /*             #endif */
+            /*             #if 'Potential' in KERNEL_VF */
+            vvdw             = _mm_add_ps(vvdw12,vvdw6);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv{I}{J})));
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*         #endif */
+            /*         ## End of check for vdw interaction forms */
+            /*     #endif */
+            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
+
+            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
+            d                = _mm_sub_ps(r{I}{J},rswitch);
+            d                = _mm_max_ps(d,_mm_setzero_ps());
+            d2               = _mm_mul_ps(d,d);
+            sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
+            /*         #define INNERFLOPS INNERFLOPS+10 */
+
+            /*         #if 'Force' in KERNEL_VF */
+            dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
+            /*             #define INNERFLOPS INNERFLOPS+5 */
+            /*         #endif */
+
+            /* Evaluate switch function */
+            /*         #if 'Force' in KERNEL_VF */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+            felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(velec,dsw)) );
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+            fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv{I}{J},_mm_mul_ps(vvdw,dsw)) );
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*         #endif */
+            /*         #if 'Potential' in KERNEL_VF */
+            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+            velec            = _mm_mul_ps(velec,sw);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+            vvdw             = _mm_mul_ps(vvdw,sw);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*         #endif */
+            /*     #endif */
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            cutoff_mask      = _mm_cmplt_ps(rsq{I}{J},rcutoff2);
+            /*         #define INNERFLOPS INNERFLOPS+1 */
+            /*     #endif */
+
+            /*     #if 'Potential' in KERNEL_VF */
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+            /*             #if ROUND == 'Epilogue' */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            /*             #endif */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            vgb              = _mm_and_ps(vgb,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+            /*             #if ROUND == 'Epilogue' */
+            vgb              = _mm_andnot_ps(dummy_mask,vgb);
+            /*             #endif */
+            vgbsum           = _mm_add_ps(vgbsum,vgb);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*         #endif */
+            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+            /*             #if ROUND == 'Epilogue' */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            /*             #endif */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*     #endif */
+
+            /*     #if 'Force' in KERNEL_VF */
+
+            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
+            fscal            = _mm_add_ps(felec,fvdw);
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
+            fscal            = felec;
+            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
+            fscal            = fvdw;
+            /*        #endif */
+
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+
+            /*             #if ROUND == 'Epilogue' */
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+            /*             #endif */
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx{I}{J});
+            ty               = _mm_mul_ps(fscal,dy{I}{J});
+            tz               = _mm_mul_ps(fscal,dz{I}{J});
+
+            /* Update vectorial force */
+            fix{I}             = _mm_add_ps(fix{I},tx);
+            fiy{I}             = _mm_add_ps(fiy{I},ty);
+            fiz{I}             = _mm_add_ps(fiz{I},tz);
+            /*             #define INNERFLOPS INNERFLOPS+6 */
+
+            /* #if GEOMETRY_J == 'Particle'             */
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   tx,ty,tz);
+            /*     #define INNERFLOPS INNERFLOPS+3      */
+            /* #else                                    */
+            fjx{J}             = _mm_add_ps(fjx{J},tx);
+            fjy{J}             = _mm_add_ps(fjy{J},ty);
+            fjz{J}             = _mm_add_ps(fjz{J},tz);
+            /*     #define INNERFLOPS INNERFLOPS+3      */
+            /* #endif                                   */
+
+            /*     #endif */
+
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*         #if 0    ## This and next two lines is a hack to maintain indentation in template file */
+            {
+                /*     #endif */
+            }
+            /*     #endif */
+            /*    ## End of check for the interaction being outside the cutoff */
+
+            /* #endfor */
+            /* ## End of loop over i-j interaction pairs */
+
+            /* #if GEOMETRY_J == 'Water3'               */
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+            /*     #define INNERFLOPS INNERFLOPS+9      */
+            /* #elif GEOMETRY_J == 'Water4'             */
+            /*     #if 0 in PARTICLES_J                 */
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*     #define INNERFLOPS INNERFLOPS+12     */
+            /*     #else                                */
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
+                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+                                                   fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*     #define INNERFLOPS INNERFLOPS+9      */
+            /*     #endif                               */
+            /* #endif                                   */
+
+            /* Inner loop uses {INNERFLOPS} flops */
+        }
+
+        /* #endfor */
+
+        /* End of innermost loop */
+
+        /* #if 'Force' in KERNEL_VF */
+        /*     #if GEOMETRY_I == 'Particle'            */
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /*         #define OUTERFLOPS OUTERFLOPS+6     */
+        /*     #elif GEOMETRY_I == 'Water3'            */
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /*         #define OUTERFLOPS OUTERFLOPS+18    */
+        /*     #elif GEOMETRY_I == 'Water4'            */
+        /*         #if 0 in PARTICLES_I                */
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /*             #define OUTERFLOPS OUTERFLOPS+24    */
+        /*         #else                               */
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+        /*             #define OUTERFLOPS OUTERFLOPS+18    */
+        /*         #endif                              */
+        /*     #endif                                  */
+        /* #endif                                      */
+
+        /* #if 'Potential' in KERNEL_VF */
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        /*     #if KERNEL_ELEC != 'None' */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*     #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
+        gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*     #endif */
+        /*     #if KERNEL_VDW != 'None' */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*     #endif */
+        /* #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai{I},isai{I}));
+        gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
+        /*     #endif */
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses {OUTERFLOPS} flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
+    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
+    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
+    /* #if GEOMETRY_I == 'Water3'            */
+    /*     #define ISUFFIX '_W3'             */
+    /* #elif GEOMETRY_I == 'Water4'          */
+    /*     #define ISUFFIX '_W4'             */
+    /* #else                                 */
+    /*     #define ISUFFIX ''                */
+    /* #endif                                */
+    /* #if GEOMETRY_J == 'Water3'            */
+    /*     #define JSUFFIX 'W3'              */
+    /* #elif GEOMETRY_J == 'Water4'          */
+    /*     #define JSUFFIX 'W4'              */
+    /* #else                                 */
+    /*     #define JSUFFIX ''                */
+    /* #endif                                */
+    /* #if 'PotentialAndForce' in KERNEL_VF  */
+    /*     #define VFSUFFIX  '_VF'           */
+    /* #elif 'Potential' in KERNEL_VF        */
+    /*     #define VFSUFFIX '_V'             */
+    /* #else                                 */
+    /*     #define VFSUFFIX '_F'             */
+    /* #endif                                */
+
+    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+    /* #elif KERNEL_ELEC != 'None' */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+    /* #else */
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+    /* #endif  */
+}
index 93eae2be498beed32c9b7324ec69292685855664..96b9e4c1447bcbc9eb7742b8a5175136c45d5da0 100644 (file)
 /* Different default (c) and accelerated interaction-specific kernels */
 #include "nb_kernel_c/nb_kernel_c.h"
 
+/* Temporary enabler until we add the AVX kernels */
+#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) || (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) || (defined GMX_CPU_ACCELERATION_X86_AVX_256)
+#    define GMX_CPU_ACCELERATION_X86_SSE2
+#endif
+
+#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
+#    include "nb_kernel_sse2_single/nb_kernel_sse2_single.h"
+#endif
+
+
 #ifdef GMX_THREAD_MPI
 static tMPI_Thread_mutex_t nonbonded_setup_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
 #endif
@@ -94,9 +104,12 @@ gmx_nonbonded_setup(FILE *         fplog,
             /* Add the generic kernels to the structure stored statically in nb_kernel.c */
             nb_kernel_list_add_kernels(kernellist_c,kernellist_c_size);
             
-            if(fr->use_cpu_acceleration==TRUE)
+            if(!(fr!=NULL && fr->use_cpu_acceleration==FALSE))
             {
                 /* Add interaction-specific kernels for different architectures */
+#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
+                nb_kernel_list_add_kernels(kernellist_sse2_single,kernellist_sse2_single_size);
+#endif
                 ; /* empty statement to avoid a completely empty block */
             }
         }
@@ -123,11 +136,19 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
     const char *     other;
     const char *     vf;
 
-    const char *     arch[] =
+    struct
     {
-        "c"
+        const char *  arch;
+        int           simd_padding_width;
+    }
+    arch_and_padding[] =
+    {
+#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
+        { "sse2_single", 4 },
+#endif
+        { "c", 1 },
     };
-    int              narch = asize(arch);
+    int              narch = asize(arch_and_padding);
     int              i;
 
     if(nonbonded_setup_done==FALSE)
@@ -155,11 +176,13 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
     {
         nl->kernelptr_vf = gmx_nb_free_energy_kernel;
         nl->kernelptr_f  = gmx_nb_free_energy_kernel;
+        nl->simd_padding_width = 1;
     }
     else if(!gmx_strcasecmp_min(geom,"CG-CG"))
     {
         nl->kernelptr_vf = gmx_nb_generic_cg_kernel;
         nl->kernelptr_f  = gmx_nb_generic_cg_kernel;
+        nl->simd_padding_width = 1;
     }
     else
     {
@@ -167,23 +190,28 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
 
         for(i=0;i<narch && nl->kernelptr_vf==NULL ;i++)
         {
-               nl->kernelptr_vf = nb_kernel_list_findkernel(log,arch[i],elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
+            nl->kernelptr_vf = nb_kernel_list_findkernel(log,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
+            nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
         }
         for(i=0;i<narch && nl->kernelptr_f==NULL ;i++)
         {
-            nl->kernelptr_f  = nb_kernel_list_findkernel(log,arch[i],elec,elec_mod,vdw,vdw_mod,geom,other,"Force");
+            nl->kernelptr_f = nb_kernel_list_findkernel(log,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"Force");
+            nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
+
             /* If there is not force-only optimized kernel, is there a potential & force one? */
             if(nl->kernelptr_f == NULL)
             {
-                nl->kernelptr_f  = nb_kernel_list_findkernel(NULL,arch[i],elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
+                nl->kernelptr_f  = nb_kernel_list_findkernel(NULL,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
+                nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
             }
         }
-
+        
         /* Give up, pick a generic one instead */
         if(nl->kernelptr_vf==NULL)
         {
             nl->kernelptr_vf = gmx_nb_generic_kernel;
             nl->kernelptr_f  = gmx_nb_generic_kernel;
+            nl->simd_padding_width = 1;
             if(log)
             {
                 fprintf(log,
index 7e17b8da3203cfae775503f2fd7a9428d917bbb3..4e05effd83d03479febec25a876ddc2979130348 100644 (file)
@@ -235,12 +235,6 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
     double          cycles_pmes;
     gmx_bool        bPMETuneTry=FALSE,bPMETuneRunning=FALSE;
 
-    if(MASTER(cr))
-    {
-        gmx_warning("New C kernels (and force-only) kernels are now enabled,\n"
-                    "but it will be another couple of days for SSE/AVX kernels.\n\n");
-    }
-
 #ifdef GMX_FAHCORE
     /* Temporary addition for FAHCORE checkpointing */
     int chkpt_ret;
index d967c87c397ce003e90ca16119ce18a71d6b523a..f86c6afdc244a8e44d2c37a73e003a14f233729e 100644 (file)
@@ -95,7 +95,7 @@ void ns(FILE *fp,
 
     nsearch = search_neighbours(fp,fr,x,box,top,groups,cr,nrnb,md,
                                 lambda,dvdlambda,grppener,
-                                bFillGrid,bDoLongRangeNS);
+                                bFillGrid,bDoLongRangeNS,TRUE);
   if (debug)
     fprintf(debug,"nsearch = %d\n",nsearch);
 
index eb842681126e619ce1a1067668871a7f15b71afc..abaa1b91e2b90dbc7324b7d1c679cf8a2a809776 100644 (file)
@@ -80,6 +80,15 @@ static gmx_bool NOTEXCL_(t_excl e[],atom_id i,atom_id j)
 #define NOTEXCL(e,i,j) !(ISEXCL(e,i,j))
 #endif
 
+static int
+round_up_to_simd_width(int length, int simd_width)
+{
+    int offset,newlength;
+    
+    offset = (simd_width>0) ? length % simd_width : 0;
+
+    return (offset==0) ? length : length-offset+simd_width;
+}
 /************************************************
  *
  *  U T I L I T I E S    F O R    N S
@@ -140,6 +149,7 @@ static void init_nblist(FILE *log, t_nblist *nl_sr,t_nblist *nl_lr,
             nl->igeometry  = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
         }
         
+        /* This will also set the simd_padding_width field */
         gmx_nonbonded_set_kernel_pointers( (i==0) ? log : NULL,nl);
         
         /* maxnri is influenced by the number of shifts (maximum is 8)
@@ -384,6 +394,15 @@ static inline void close_i_nblist(t_nblist *nlist)
     
     if (nri >= 0)
     {
+        /* Add elements up to padding. Since we allocate memory in units
+         * of the simd_padding width, we do not have to check for possible
+         * list reallocation here.
+         */
+        while((nlist->nrj % nlist->simd_padding_width)!=0)
+        {
+            /* Use -4 here, so we can write forces for 4 atoms before real data */
+            nlist->jjnr[nlist->nrj++]=-4;
+        }
         nlist->jindex[nri+1] = nlist->nrj;
         
         len=nlist->nrj -  nlist->jindex[nri];
@@ -446,7 +465,8 @@ static inline void add_j_to_nblist(t_nblist *nlist,atom_id j_atom,gmx_bool bLR)
     
     if (nlist->nrj >= nlist->maxnrj)
     {
-        nlist->maxnrj = over_alloc_small(nlist->nrj + 1);
+        nlist->maxnrj = round_up_to_simd_width(over_alloc_small(nlist->nrj + 1),nlist->simd_padding_width);
+        
         if (gmx_debug_at)
             fprintf(debug,"Increasing %s nblist (ielec=%d,ivdw=%d,free=%d,igeometry=%d) j size to %d\n",
                     bLR ? "LR" : "SR",nlist->ielec,nlist->ivdw,nlist->free_energy,nlist->igeometry,nlist->maxnrj);
@@ -2362,7 +2382,8 @@ int search_neighbours(FILE *log,t_forcerec *fr,
                       real *lambda,real *dvdlambda,
                       gmx_grppairener_t *grppener,
                       gmx_bool bFillGrid,
-                      gmx_bool bDoLongRangeNS)
+                      gmx_bool bDoLongRangeNS,
+                      gmx_bool bPadListsForKernels)
 {
     t_block  *cgs=&(top->cgs);
     rvec     box_size,grid_x0,grid_x1;
index 0c4337e169d6ab4a5862897f5be7697d2bfd2048..c799b750c472ed5bc1712e3bff27abc2d5d11427 100644 (file)
@@ -283,7 +283,7 @@ void do_nsgrid(FILE *fp,gmx_bool bVerbose,
   snew(dvdl,efptNR);
   init_neighbor_list(fp,fr,md->homenr);
   search_neighbours(fp,fr,x,box,top,
-                   &mtop->groups,cr,&nrnb,md,lambda,dvdl,NULL,TRUE,FALSE);
+                   &mtop->groups,cr,&nrnb,md,lambda,dvdl,NULL,TRUE,FALSE,FALSE);
 
   if (debug)
     dump_nblist(debug,cr,fr,0);